def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        # parse() expects an iterable of text lines, so split the decoded body
        # (the original passed resp.content, which is raw bytes)
        rp.parse(resp.text.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
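# A minimal usage sketch for the cached helper above. The snippet does not show
# its imports or the robots_cache definition, so the Python 3 wiring below is
# an assumption, and the example URL is illustrative only.
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

robots_cache = {}  # netloc -> RobotFileParser, filled lazily by accessible()

if accessible('https://example.com/some/page'):
    html = requests.get('https://example.com/some/page').text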
def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given domain.
    If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    self._robots[netloc] = robot
def __init__(self, url):
    self.page_url = url
    self.parsed_url = urlparse.urlparse(url)
    self.lang = ""
    self.isDownload = False
    self.title = ""
    self.text = ""
    self.soup = None
    self.robot = RobotFileParser()
def _get_robot_parser(self):
    if self.robot_parser_pickle is not None:
        return pickle.loads(base64.b64decode(self.robot_parser_pickle))
    else:
        parser = RobotFileParser()
        parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
        self.robot_parser = parser
        return parser
def _get_robot_parser(self):
    try:
        return pickle.loads(str(self.robot_parser_pickle))
    except (TypeError, IndexError):
        parser = RobotFileParser()
        parser.set_url(str(self.protocol) + "://" + str(self.domain) + "/robots.txt")
        self.robot_parser = parser
        return parser
def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given domain.
    If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    self.execute("UPDATE domain SET robots=? WHERE netloc=?", dumps(robot), netloc)
def _allowed_to_open(self, url):
    host = urlparse.urlsplit(url)[1]
    robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    try:
        rp.read()
    except:
        return False
    return rp.can_fetch(self._agent_name, url)
def http_open(self, request):
    # request -- urllib2.Request
    url = request.get_full_url()
    host = urlsplit(url)[1]
    robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    rp.read()
    if not rp.can_fetch(self.agentname, url):
        raise RuntimeError('Forbidden by robots.txt')
    return urllib2.HTTPHandler.http_open(self, request)
def http_open(self, req):
    url = req.get_full_url()
    host = urlsplit(url)[1]
    robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
    robotfileparser = RobotFileParser(robots_url)
    robotfileparser.read()
    if not robotfileparser.can_fetch(self.crawlername, url):
        raise RuntimeError('Forbidden by robots.txt')
    return urllib2.HTTPHandler.http_open(self, req)
def check_robots(self, url):
    '''check the robots.txt in this url's domain'''
    hostname = urlparse(url).netloc
    if hostname not in self.domain_list:  # no records in domain_list
        rp = RobotFileParser('http://%s/robots.txt' % hostname)
        print("%s: fetching %s" % (url, rp.url))
        try:
            rp.read()  # get new robots.txt
        except IOError, e:  # url's server not available (connection timeout)
            log.error(str(e))
            rp.disallow_all = True  # reject all requests
        self.domain_list[hostname] = rp  # add domain entry into domain_list
def try_add_robot(self, url):
    parsed_url = urlparse(url)
    if parsed_url.netloc not in self.robots:
        try:
            robot_url = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
            rp = RobotFileParser(robot_url)
            rp.read()
            self.robots[parsed_url.netloc] = rp
        except IOError as e:
            print str(e)
        except Exception as e:
            print str(e)
def getRobots(url):
    parsed = urlparse(url)
    robots_url = parsed.scheme + '://' + parsed.netloc + '/robots.txt'
    if robots_url not in robots:
        rp = RobotFileParser()
        try:
            r = requests.get(robots_url, verify=False, timeout=1)
            r.raise_for_status()
        except Exception:
            rp.parse([])  # no rules available: everything is allowed
        else:
            # parse() expects an iterable of lines; passing the raw string
            # (as the original did) would iterate over single characters
            rp.parse(r.text.splitlines())
        #print " new robot at " + robots_url
        robots[robots_url] = rp
    return robots[robots_url]
def robots_precheck(self, url):
    """
    If we have the robots.txt file available, check it to see if the
    request is permissible.

    This does not fetch robots.txt.
    """
    fetcher = RedFetcher(url)
    robots_txt = fetcher.fetch_robots_txt(url, lambda a: a, network=False)
    if robots_txt == "":
        return True
    checker = RobotFileParser()
    checker.parse(robots_txt.splitlines())
    return checker.can_fetch(UA_STRING, url)
def __init__(self, starturl, index_html='', maxlevel=1,
             cookie_file=None, acldb=None, urldb=None, default_charset=None,
             delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    assert proto == 'http'
    #Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
        self.cookiejar = MozillaCookieJar(cookie_file)
        self.cookiejar.load()
    else:
        self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    self.robotstxt.read()
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
        starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}  # 1:injected, 2:crawled
    return
def link_crawler(seed_url, link_regex):
    import re
    import urlparse
    from robotparser import RobotFileParser
    crawler_queue = [seed_url]
    seen = {}
    # Build the parser once for the seed domain. The original chained
    # RobotFileParser().set_url(...).read(), whose result is None, and
    # checked an undefined `web_url`; both are corrected here.
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawler_queue:
        url = crawler_queue.pop()
        html = download(url, now=1)
        for link in get_links(url):
            depth = seen.get(link, 1)
            # record the link only when it passes the checks; the original
            # stored it first, so `link not in seen` could never be true
            if re.match(link_regex, link) and link not in seen \
                    and rp.can_fetch(user_agent, url) and depth != max_try:
                seen[link] = depth + 1
                link = urlparse.urljoin(seed_url, link)
                crawler_queue.append(link)
class Host(object):
    '''
    Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.

    :param hostname: the name of the host extracted from an URL.
    '''

    def __init__(self, hostname):
        self.hostname = hostname
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)
        self.rp.read()  # fetch the rules; can_fetch() gives no real answer before read()

    def url_allowed(self, url):
        '''
        Checks if the given url is allowed to crawl.

        :param url: URL to check.
        '''
        return self.rp.can_fetch(USER_AGENT, url)
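# Usage sketch for the Host class above, assuming a module-level USER_AGENT
# string as the class itself does; the hostname is illustrative only.
host = Host('example.com')
if host.url_allowed('http://example.com/some/page.html'):
    pass  # safe to crawl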
def test_parse(self):
    from robotparser import RobotFileParser
    rules = RobotFileParser()
    rules.set_url("http://www.sogou.com/robots.txt")
    rules.read()
    self.assertEqual(
        rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"),
        False)
def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False, max_size=1000000, force_html=True, **kwargs):
    """Crawl website html and return list of URLs crawled

    seed_url: url to start crawling from
    max_urls: maximum number of URLs to crawl (use None for no limit)
    max_depth: maximum depth to follow links into website (use None for no limit)
    obey_robots: whether to obey robots.txt
    max_size is passed to get() and is limited to 1MB by default
    force_html is passed to get() and is set to True by default so only crawl HTML content
    **kwargs is passed to get()
    """
    user_agent = kwargs.get("user_agent", self.user_agent)
    server = "http://" + extract_domain(seed_url)
    robots = RobotFileParser()
    if obey_robots:
        robots.parse(self.get(server + "/robots.txt").splitlines())  # load robots.txt
    outstanding = [(seed_url, 0), (server, 0)]  # which URLs need to crawl
    crawled = []  # urls that have been crawled

    while outstanding:
        # more URLs to crawl
        if len(crawled) == max_urls:
            break
        url, cur_depth = outstanding.pop(0)
        if url not in crawled:
            html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
            crawled.append(url)
            if max_depth is None or cur_depth < max_depth:
                # continue crawling
                for scraped_url in re.findall(re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE), html):
                    if "#" in scraped_url:
                        # remove internal links to prevent duplicates
                        scraped_url = scraped_url[:scraped_url.index("#")]
                    scraped_url = urljoin(server, scraped_url)  # support relative links
                    # the robots check now runs on the absolute url; the original
                    # checked the raw (possibly relative) href before joining
                    if os.path.splitext(scraped_url)[-1].lower() not in Download.IGNORED_EXTENSIONS \
                            and robots.can_fetch(user_agent, scraped_url):
                        # check if same domain or sub-domain
                        this_server = extract_domain(scraped_url)
                        if this_server and (this_server in server or server in this_server):
                            outstanding.append((scraped_url, cur_depth + 1))
    return crawled
def checkRobots(URL):
    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
def __init__(self, starturl, index_html='', maxlevel=1,
             cookie_file=None, acldb=None, urldb=None, default_charset=None,
             delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    # assert proto == 'http'
    #Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
        self.cookiejar = MozillaCookieJar(cookie_file)
        self.cookiejar.load()
    else:
        self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    try:
        self.robotstxt.read()
    except IOError:
        pass
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
        starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}  # 1:injected, 2:crawled
    return
def run_continue(self, robots_txt):
    """
    Continue after getting the robots file.
    TODO: refactor callback style into events.
    """
    if robots_txt == "":  # empty or non-200
        pass
    else:
        checker = RobotFileParser()
        checker.parse(robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines())
        if not checker.can_fetch(UA_STRING, self.request.uri):
            self.response.http_error = RobotsTxtError()
            self.finish_task()
            return  # TODO: show error?

    if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
        self.request.headers.append((u"User-Agent", UA_STRING))
    self.exchange = self.client.exchange()
    self.exchange.on('response_start', self._response_start)
    self.exchange.on('response_body', self._response_body)
    self.exchange.on('response_done', self._response_done)
    self.exchange.on('error', self._response_error)
    if self.status_cb and self.name:
        self.status_cb("fetching %s (%s)" % (self.request.uri, self.name))
    req_hdrs = [(k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
                for (k, v) in self.request.headers]
    self.exchange.request_start(self.request.method, self.request.uri, req_hdrs)
    self.request.start_time = thor.time()
    if self.request.payload is not None:
        self.exchange.request_body(self.request.payload)
        self.transfer_out += len(self.request.payload)
    self.exchange.request_done([])
def can_fetch(self, url):
    host, path = urlparse.urlparse(url)[1:3]
    if host in self.rules:
        return self.rules[host].can_fetch(self.agent, url)
    else:
        rp = RobotFileParser()
        robot_url = "http://" + host + "/robots.txt"
        rp.set_url(robot_url)
        rp.read()
        self.rules[host] = rp
        return rp.can_fetch(self.agent, url)
def is_page_robot_scannable(self):
    """
    Returns a boolean that tells whether the page is robot scrapeable.
    """
    robotcheck = RobotFileParser()
    robotcheck.set_url(self.urlparse[0] + '://' + self.urlparse[1] + '/robots.txt')
    robotcheck.read()
    return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
class spider(object): CurLink = "" linknText = [] headings = [] def __init__(self, link): self.CurLink = link self.r = RobotFileParser() def crawl(self): self.r.set_url(urlparse.unquote(self.CurLink)) self.r.read() self.html = urlopen(self.CurLink).read() self.bs = BeautifulSoup(self.html, "lxml") for i in self.bs.findAll("h1", text=True): self.headings.append(i.text) for i in self.bs.findAll("h2", text=True): self.headings.append(i.text) for i in self.bs.findAll("h3", text=True): self.headings.append(i.text) for i in self.bs.findAll("h4", text=True): self.headings.append(i.text) for i in self.bs.findAll("h5", text=True): self.headings.append(i.text) for i in self.bs.findAll("h6", text=True): self.headings.append(i.text) for link in self.bs.findAll('a', href=True): aLink = urlparse.urljoin(self.CurLink, link['href']) if (self.r.can_fetch("*", aLink)): self.linknText.append({ "URL": aLink, "AnchorText": link.string })
def __init__(self, url):
    self.url = urlManip.cleanURL(url)
    self.pages = []
    self.suggestions = set()
    self.loaded = False
    logger.info("Loading %s..." % (self.url))
    try:
        requests.get(self.url)
        self.loaded = True
    except IOError as e:
        logger.error("%s cannot be loaded: %s" % (self.url, e))

    # if the website can be loaded
    if self.loaded:
        logger.info("Load successful. Generating suggestions...")

        # get robots.txt
        rp = RobotFileParser(self.url + "robots.txt")
        try:
            rp.read()
        except IOError:
            logger.warning("robots.txt cannot be found.")

        # get home page
        self.pages.append(Page(self.url))

        # get all pages on homepage
        self.pages[0].load()
        for link in self.pages[0].internalLinks:
            if rp.can_fetch("*", link):
                if link[:4] == 'http':
                    self.pages.append(Page(link))
                else:
                    self.pages.append(Page(self.url + link))
            else:
                logger.debug("Ignoring %s based on robots.txt" % link)
def disallow(self, url):
    """ TO BE DONE """
    robotFile = urljoin(url, "/robots.txt")
    # the original left this line commented out, which made `key` undefined
    key = hashlib.sha1(robotFile).hexdigest()
    if not self._dict.has_key(key):
        self._dict[key] = RobotFileParser(robotFile)
        try:
            self._dict[key].read()
        except:
            self._dict[key] = None
    result = self._dict[key] is None or not self._dict[key].can_fetch(self._userAgent, url)
    return result
def get_robots(url):
    '''
    Initialize robots parser for this domain

    :param url: any URL on the domain whose robots.txt should be loaded
    :return: a RobotFileParser with the domain's rules already read
    '''
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
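# Usage sketch (Python 2 style, matching the urlparse module used above);
# the example URL is illustrative only.
rp = get_robots('http://example.com/any/page')
print rp.can_fetch('*', 'http://example.com/any/page')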
class HolidayScrapper:
    def __init__(self):
        self.rp = RobotFileParser()
        self.rp.set_url('https://www.timeanddate.com/robots.txt')
        self.rp.read()
        if not self.rp.can_fetch('WasThereAHoliday', init_url):
            raise RuntimeError('Scraping forbidden due to robots.txt file')
        self.countries = self.get_countries(self.get_page(init_url))
        try:
            # removing entries which are not countries
            self.countries.remove('un')
        except ValueError:
            pass
        try:
            # removing entries which are not countries
            self.countries.remove('world')
        except ValueError:
            pass

    def get_data(self):
        all_data = pd.DataFrame()
        for cntr in self.countries:
            print 'Fetching data for ' + cntr
            try:
                url = 'https://www.timeanddate.com/holidays/' + cntr + '/2016#!hol=8389401'
                if not self.rp.can_fetch('WasThereAHoliday', url):
                    raise RuntimeError('Scraping forbidden due to robots.txt file')
                soup = self.get_page(url)
                html_table = soup.find('table')
                df_table = pd.read_html(str(html_table))[0]
                df_table['country'] = cntr
                all_data = all_data.append(df_table)
            except ValueError:
                print 'Problem occurred when fetching data for ' + cntr
        return all_data

    @staticmethod
    def get_page(url):
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.text, 'lxml')
        return soup

    @staticmethod
    def get_countries(soup):
        countries = []
        select_list = soup.find(id="co")
        for cntr in select_list.children:
            countries.append(cntr['value'])
        return countries
def robots_check(url):
    # creating url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix

    # checking url validity
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""
    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
def can_read(url):
    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False
        Permissions[domain] = rp

    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False
    return res
def urlopen(self, host):
    robo_url = host.get_robots_url()
    print self.robotdict

    cached_parser = self.robotdict.get(robo_url)
    if cached_parser:
        logging.info("Found in Cache: " + robo_url)
    else:
        logging.info("Fetching: " + robo_url)
        cached_parser = RobotFileParser()
        cached_parser.set_url(robo_url)
        cached_parser.read()
        self.robotdict.put(robo_url, cached_parser)

    if cached_parser.can_fetch('*', host.get_url()):
        print 'Going to fetch:', host.get_url()
        return self.fetch_file(host.get_url())
    else:
        logging.info("Forbidden by Robots.txt")
        return None
def __init__(self, file=None):
    RobotFileParser.__init__(self)
    self._file = file
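# The fragment above only stores a file handle. A sketch of how such a
# file-backed parser might be completed; the class name and the read()
# override below are assumptions, not part of the original.
from urllib.robotparser import RobotFileParser

class FileRobotFileParser(RobotFileParser):
    def __init__(self, file=None):
        RobotFileParser.__init__(self)
        self._file = file

    def read(self):
        # Assumes self._file is an open text-mode file containing robots.txt rules.
        self.parse(self._file.read().splitlines())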
class CrawlerProcess:
    def __init__(self, index, lock):
        self.current_process_id = index
        self.lock = lock
        number_of_retries = 0
        #print("[CREATED CRAWLER PROCESS]", self.current_process_id)

        # Create the chrome driver with which we will fetch and parse sites
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("headless")
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

        """ site is a dictionary with all the fields from the database (domain, robots_content, sitemap_content) """
        self.site = None
        """ robots_parser is an object which allows us to use the robots.txt file """
        self.robots_parser = None
        """ Holds all the pages which will be added to the frontier at the end of each run """
        self.pages_to_add_to_frontier = []
        """ current_page is a dictionary with an id (database id for updating) and url field """
        self.current_page = database_handler.get_page_from_frontier(self.lock)

        """
        If a page was fetched from the frontier the crawler can continue, otherwise try again in DELAY seconds.
        If the frontier is still empty after MAX_NUMBER_OF_RETRIES was reached, we can assume that the frontier
        is really empty and no crawler process is going to insert new pages.
        """
        while self.current_page or number_of_retries < MAX_NUMBER_OF_RETRIES:
            if self.current_page:
                number_of_retries = 0
                try:
                    self.crawl()
                except Exception as error:
                    print("[CRAWLER PROCESS] An unhandled error occurred while parsing page: {}"
                          .format(self.current_page["url"]), error)
            else:
                # No page was fetched from the frontier, try again in DELAY seconds
                number_of_retries += 1
                print("[CRAWLER PROCESS] Frontier is empty, retrying in 10 seconds", self.current_process_id)
                time.sleep(DELAY)

            # Reset all variables after a page was successfully transferred from the frontier
            self.current_page = database_handler.get_page_from_frontier(self.lock)
            self.site = None
            self.robots_parser = None
            self.pages_to_add_to_frontier = []

        self.quit()
        print("[STOPPED CRAWLER PROCESS] Frontier is empty after several tries", self.current_process_id)

    def crawl(self):
        #print(" {} - [CRAWLING PAGE]".format(self.current_process_id), self.current_page["url"])
        domain = self.get_domain_url(self.current_page["url"])
        self.site = database_handler.get_site(domain)
        if self.site is None:
            self.create_site(domain)
        else:
            if self.site["robots_content"] is not None:
                # Create robots_parser from robots.txt saved in the database
                self.parse_robots(self.site["robots_content"])

        self.current_page["site_id"] = self.site["id"]
        self.current_page["accessed_time"] = datetime.now()

        if self.allowed_to_crawl_current_page(self.current_page["url"]) is False:
            #print(" [CRAWLING] Robots do not allow this page to be crawled: {}".format(self.current_page["url"]))
            self.current_page["page_type_code"] = PAGE_TYPES["disallowed"]
            self.current_page["http_status_code"] = 500
            database_handler.remove_page_from_frontier(self.current_page)
            return
        else:
            # If a crawl delay is available in robots wait until the page can be crawled then continue
            self.wait_for_crawl_delay_to_elapse()

        # The crawler is allowed to crawl the current site, therefore we can perform a request
        page_response = self.fetch_response(self.current_page["url"])
        if page_response:
            # No errors while fetching the response
            content_type = ""
            if "content-type" in page_response.headers:
                # Content type is not necessarily always present (e.g. when Transfer-Encoding is set)
                content_type = page_response.headers['content-type']

            self.current_page["http_status_code"] = page_response.status_code

            if CONTENT_TYPES["HTML"] in content_type:
                # We got an HTML page
                html_content = self.fetch_rendered_page_source(self.current_page["url"])
                if html_content is not None:
                    if self.is_duplicate_page(html_content):
                        print(" [CRAWLING] Found page duplicate, that has already been parsed: ",
                              self.current_page["url"])
                        self.current_page["page_type_code"] = PAGE_TYPES["duplicate"]
                        self.current_page["hash_content"] = hash_driver.create_content_hash(html_content)
                    else:
                        # page is not treated as duplicate page - insert hash signature to db
                        database_handler.insert_page_signatures(self.current_page["id"],
                                                                self.current_page["hash_signature"])
                        self.current_page["page_type_code"] = PAGE_TYPES["html"]
                        self.current_page["html_content"] = html_content
                        self.current_page["hash_content"] = hash_driver.create_content_hash(html_content)

                        parsed_page = self.parse_page(self.current_page["html_content"])
                        if len(parsed_page['links']):
                            for link in parsed_page['links']:
                                self.add_page_to_frontier_array(link)
                        if len(parsed_page['images']):
                            for image_url in parsed_page['images']:
                                self.add_page_to_frontier_array(image_url)
                else:
                    # An error occurred while rendering page
                    self.current_page["page_type_code"] = PAGE_TYPES["error"]
                    self.current_page["http_status_code"] = 500
            elif CONTENT_TYPES["IMG"] in content_type:
                # We can be pretty sure that we have an image
                self.current_page["page_type_code"] = PAGE_TYPES["image"]
                filename = self.get_image_filename(self.current_page["url"])
                image_data = {
                    "page_id": self.current_page["id"],
                    "content_type": content_type,
                    "data": page_response.content,
                    "data_size": len(page_response.content),
                    "accessed_time": datetime.now(),
                    "filename": filename
                }
                database_handler.insert_image_data(image_data)
            else:
                # The crawler detected a non-image binary file
                self.current_page["page_type_code"] = PAGE_TYPES["binary"]
                data_type_code = None
                # Find the correct data_type_code from all the content types
                for code, value in CONTENT_TYPES.items():
                    if content_type == value:
                        data_type_code = code
                if data_type_code is None:
                    # The content type is not in the allowed values, therefore we can ignore it
                    pass
                    #print(" [CRAWLING] Page response content-type is not in CONTENT_TYPES: ", content_type)
                else:
                    page_data = {
                        "page_id": self.current_page["id"],
                        "data_type_code": data_type_code,
                        "data": page_response.content,
                        "data_size": len(page_response.content)
                    }
                    database_handler.insert_page_data(page_data)
        else:
            # An error occurred while fetching page (SSL certificate error, timeout, etc.)
            self.current_page["page_type_code"] = PAGE_TYPES["error"]
            self.current_page["http_status_code"] = 500

        # Update the page in the database, remove FRONTIER type and replace it with the correct one
        database_handler.remove_page_from_frontier(self.current_page)
        # Add all the links from the page and sitemap to the frontier
        database_handler.add_pages_to_frontier(self.pages_to_add_to_frontier)
        #print(" {} - [CRAWLING] Finished crawling".format(self.current_process_id))

    """
    Fetch a response from the url, so that we get the status code and find out if any errors occur while
    fetching (some sites for example require a certificate to connect, some sites timeout, etc.)
    """
    def fetch_response(self, url):
        try:
            response = requests.get(url)
            return response
        except requests.exceptions.RequestException as exception:
            print(" [CRAWLING - ERROR]", exception)
            return None

    """ Create a new site object and insert it into the database """
    def create_site(self, domain):
        # We need to create a new site object
        self.site = {"domain": domain}
        robots_content = self.fetch_robots(domain)
        sitemap_content = None
        if robots_content is not None:
            # Create robots_parser from fetched robots.txt
            self.parse_robots(robots_content)
            sitemaps = self.robots_parser.get_sitemaps()
            if len(sitemaps) > 0:
                for sitemap_url in sitemaps:
                    sitemap_content = self.fetch_sitemap(sitemap_url)
                    if sitemap_content is not None:
                        self.parse_sitemap(sitemap_content)
        self.site["robots_content"] = robots_content
        self.site["sitemap_content"] = sitemap_content
        # Insert the new site into database and return the id
        self.site["id"] = database_handler.insert_site(self.site)

    """
    Fetch and render the site in the chrome driver then return the resulting html so that it can
    be saved in the current page html_content
    """
    def fetch_rendered_page_source(self, url):
        try:
            self.driver.get(url)
            return self.driver.page_source
        except Exception as error:
            print(" [CRAWLING] Error while fetching rendered page source", error)
            return None

    """
    Get the domain name of the current site so that we can check if the site is already in the
    database or if we have to create it
    """
    def get_domain_url(self, url):
        parsed_uri = urlparse(url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    """
    Get the filename from an online image resource
    https://stackoverflow.com/questions/10552188/python-split-url-to-find-image-name-and-extension
    """
    def get_image_filename(self, image_url):
        filename = image_url.split('/')[-1]
        return filename

    def fetch_robots(self, domain):
        response = self.fetch_response(domain + "/robots.txt")
        # We need to check if the returned file is actually a txt file, because some sites route back to the index page
        if response and response.status_code == 200 and "text/plain" in response.headers['content-type']:
            return response.text
        return None

    def fetch_sitemap(self, sitemap_url):
        response = self.fetch_response(sitemap_url)
        if response and response.status_code == 200:
            # Sitemap found
            return response.text
        return None

    """
    This function parses the robots.txt from memory using the modified robotparser class
    The self.robots_parser includes functions to check if the parser is allowed to parse a certain site
    """
    def parse_robots(self, robots_text):
        self.robots_parser = RobotFileParser(robots_text)
        self.robots_parser.read()

    """
    https://stackoverflow.com/questions/31276001/parse-xml-sitemap-with-python
    This only works for the standard XML sitemap
    """
    def parse_sitemap(self, sitemap_xml):
        try:
            soup = BeautifulSoup(sitemap_xml, 'lxml')
            sitemap_tags = soup.find_all("loc")
            if sitemap_tags is None:
                return
            for sitemap_tag in sitemap_tags:
                url = self.get_parsed_url(sitemap_tag.text)
                if url:
                    self.add_page_to_frontier_array(url)
        except Exception as error:
            print(error)

    """ Checks if robots are set for the current site and if they allow the crawling of the current page """
    def allowed_to_crawl_current_page(self, url):
        if self.robots_parser is not None:
            return self.robots_parser.can_fetch('*', url)
        return True

    """ Checks if crawl-delay property is set and if it exists check if the required time has elapsed """
    def wait_for_crawl_delay_to_elapse(self):
        try:
            if self.robots_parser is not None:
                crawl_delay = self.robots_parser.crawl_delay('*')
                if crawl_delay is not None:
                    if "last_crawled_at" in self.site and self.site["last_crawled_at"] is not None:
                        site_last_crawled_at = self.site["last_crawled_at"]
                        can_crawl_again_at = site_last_crawled_at + timedelta(seconds=crawl_delay)
                        current_time = datetime.now()
                        time_difference = (can_crawl_again_at - current_time).total_seconds()
                        if time_difference > 0:
                            #print(" [CRAWLING] Crawl delay has not yet elapsed for site: {}".format(
                            #    self.site["domain"]))
                            time.sleep(crawl_delay)
        except Exception as error:
            print(" [CRAWLING] Error while handling crawl delay", error)

    """
    Use the chrome driver to fetch all links and image sources in the rendered page
    (the driver already returns absolute urls)
    Note: Sometimes throws StaleElementReferenceException, need to check what that's about.
    The exception itself just means that the desired element is no longer rendered in DOM.
    Maybe the memory was getting low, since I got the error when I was running 10 crawler processes.
    """
    def parse_page(self, html_content):
        links = []
        images = []
        try:
            browser = self.driver
            anchor_tags = browser.find_elements_by_tag_name("a")
            for anchor_tag in anchor_tags:
                href = anchor_tag.get_attribute("href")
                url = self.get_parsed_url(href)
                if url:
                    links.append(url)
            image_tags = browser.find_elements_by_tag_name("img")
            for image_tag in image_tags:
                src = image_tag.get_attribute("src")
                if src:
                    image_url = self.get_parsed_image_url(src)
                    if image_url:
                        images.append(image_url)
            soup = BeautifulSoup(html_content, 'html.parser')
            script_tags = soup.findAll('script')
            for script_tag in script_tags:
                links_from_javascript = self.parse_links_from_javacript(script_tag.text)
                for link in links_from_javascript:
                    links.append(self.get_parsed_url(link))
            return {"links": links, "images": images}
        except Exception as error:
            print("[ERROR WHILE RENDERING WITH WEB DRIVER]", error)
            return {"links": links, "images": images}

    """ Find all the hrefs that are set in javascript code (window.location changes) """
    def parse_links_from_javacript(self, javascript_text):
        links = []
        try:
            links = re.findall(
                r'(http://|https://)([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
                javascript_text)
            if not links:
                return []
            links = [''.join(link) for link in links]
        except Exception as error:
            print(" [CRAWLING] Error while parsing links from Javascript", error)
        return links

    """ Create a parsed url (ignore javascript and html actions, remove hashes, fix relative urls etc.) """
    # TODO: remove index.html index.php
    def get_parsed_url(self, url):
        if url is None or url == "":
            return None
        domain = self.site["domain"]
        if not url.startswith("http"):
            # Since the chrome driver returns absolute urls, the url is most likely javascript or an action
            if 'javascript:' in url:
                # This is just javascript code inside a href
                return None
            if ('mailto:' in url) or ('tel:' in url):
                # This is an action inside a href
                return None
            if url[0] == "#":
                # Link starts with a # (it's a target link)
                return None
            if url == "/":
                # This is the index page, which we already have in the frontier
                return None
            if url.startswith("www"):
                url = "http://{}".format(url).strip()
            """
            Fix relative urls just in case
            This function might not work correctly since it's almost impossible to know which root url
            the link takes when it's added to the site
            """
            if url[0] == "/":
                if domain[-1] == "/":
                    # Make sure only one slash is present
                    url = url[1:]
            else:
                if domain[-1] != "/":
                    url = "/{}".format(url)
            url = "{}{}".format(domain, url).strip()
        # Remove everything after the hash
        if "#" in url:
            url = url.split("#")[0]
        # Encode special characters (the second parameter are characters that the encoder will not encode)
        url = quote(url.encode("UTF-8"), ':/-_.~&?+=')
        return url

    """ Parse image urls """
    def get_parsed_image_url(self, url):
        if url is None or url == "":
            return None
        # Do not parse base64 images
        if url.startswith("data:image"):
            return None
        if not url.startswith("http"):
            # This is very unlikely, since the chrome driver returns all the image sources with absolute urls
            domain = self.site["domain"]
            """
            Fix relative urls just in case
            This function might not work correctly since it's almost impossible to know which root url
            the link takes when it's added to the site
            """
            if url[0] == "/":
                if domain[-1] == "/":
                    # Make sure only one slash is present
                    url = url[1:]
            else:
                if domain[-1] != "/":
                    url = "/{}".format(url)
            # Create an absolute url
            url = "{}{}".format(domain, url).strip()
        return url

    """ The duplicate page should not have the html_content value set, page_type_code should be DUPLICATE and that's it """
    def is_duplicate_page(self, html_content):
        # sha256 digest of complete html_content
        h = hash_driver.create_content_hash(html_content)
        # first check if page is exact copy of already parsed documents
        if database_handler.find_page_duplicate(h):
            return True
        else:
            # create set of hash shingles
            # in order to prevent pages using lots of same tags to be treated as similar, remove html tags
            hash_set = hash_driver.text_to_shingle_set(self.remove_markups(html_content))
            # hash signature will be inserted to db later
            self.current_page["hash_signature"] = hash_set
            # calculate similarity between current document and already parsed documents using Jaccard similarity
            similarity = database_handler.calculate_biggest_similarity(hash_set)
            #print("SIMILARITY: ", similarity)
            return similarity > MAX_SIMILARITY

    """ Remove markup tags from html content """
    def remove_markups(self, html_content):
        return BeautifulSoup(html_content, "html.parser").text

    def add_page_to_frontier_array(self, page_url):
        page_domain = self.get_domain_url(page_url)
        if ALLOWED_DOMAIN in page_domain:
            # Only add pages in the allowed domain
            self.pages_to_add_to_frontier.append({"from": self.current_page["id"], "to": page_url})

    def quit(self):
        self.driver.quit()
class Crawler():
    # Variables
    parserobots = False
    output = None
    report = False

    config = None
    domain = ""
    exclude = []
    skipext = []
    drop = []

    debug = False

    tocrawl = set([])
    crawled = set([])
    excluded = set([])

    marked = {}

    # TODO also search for window.location={.*?}
    linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

    rp = None
    response_code = {}
    nb_url = 1      # Number of url.
    nb_rp = 0       # Number of url blocked by the robots.txt
    nb_exclude = 0  # Number of url excluded by extension or word

    output_file = None

    target_domain = ""

    def __init__(self, parserobots=False, output=None, report=False, domain="",
                 exclude=[], skipext=[], drop=[], debug=False):
        self.parserobots = parserobots
        self.output = output
        self.report = report
        self.domain = domain
        self.exclude = exclude
        self.skipext = skipext
        self.drop = drop
        self.debug = debug

        if self.debug:
            logging.basicConfig(level=logging.DEBUG)

        self.tocrawl = set([domain])

        try:
            self.target_domain = urlparse.urlparse(domain)[1]
        except:
            raise ValueError("Invalid domain")

        if self.output:
            try:
                self.output_file = open(self.output, 'w')
            except:
                logging.debug("Output file not available.")
                exit(255)

    def run(self):
        # print(..., file=None) falls back to stdout, so this also covers the
        # case where no output file was requested
        print(config.xml_header, file=self.output_file)
        logging.debug("Start the crawling process")
        while len(self.tocrawl) != 0:
            self.__crawling()
        logging.debug("Crawling has reached the end of all found links")
        print(config.xml_footer, file=self.output_file)

    def __crawling(self):
        crawling = self.tocrawl.pop()
        url = urlparse.urlparse(crawling)
        self.crawled.add(crawling)
        request = Request(crawling, headers={"User-Agent": config.crawler_user_agent})

        try:
            response = urlopen(request)
        except Exception as e:
            if hasattr(e, 'code'):
                if e.code in self.response_code:
                    self.response_code[e.code] += 1
                else:
                    self.response_code[e.code] = 1
                # Track marked urls for reporting
                if self.report:
                    if e.code in self.marked:
                        self.marked[e.code].append(crawling)
                    else:
                        self.marked[e.code] = [crawling]
            logging.debug("{1} ==> {0}".format(e, crawling))
            return self.__continue_crawling()

        # Read the response
        try:
            msg = response.read()
            if response.getcode() in self.response_code:
                self.response_code[response.getcode()] += 1
            else:
                self.response_code[response.getcode()] = 1
            response.close()

            # Get the last modified date
            if 'last-modified' in response.headers:
                date = response.headers['Last-Modified']
            else:
                date = response.headers['Date']
            date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
        except Exception as e:
            logging.debug("{1} ===> {0}".format(e, crawling))
            return None

        print("<url><loc>" + url.geturl() + "</loc><lastmod>" +
              date.strftime('%Y-%m-%dT%H:%M:%S+00:00') + "</lastmod></url>",
              file=self.output_file)
        if self.output_file:
            self.output_file.flush()

        # Found links
        links = self.linkregex.findall(msg)
        for link in links:
            link = link.decode("utf-8")
            #logging.debug("Found : {0}".format(link))
            if link.startswith('/'):
                link = 'http://' + url[1] + link
            elif link.startswith('#'):
                link = 'http://' + url[1] + url[2] + link
            elif not link.startswith('http'):
                link = 'http://' + url[1] + '/' + link

            # Remove the anchor part if needed
            if "#" in link:
                link = link[:link.index('#')]

            # Drop attributes if needed
            for toDrop in self.drop:
                link = re.sub(toDrop, '', link)

            # Parse the url to get domain and file extension
            parsed_link = urlparse.urlparse(link)
            domain_link = parsed_link.netloc
            target_extension = os.path.splitext(parsed_link.path)[1][1:]

            if link in self.crawled:
                continue
            if link in self.tocrawl:
                continue
            if link in self.excluded:
                continue
            if domain_link != self.target_domain:
                continue
            if "javascript" in link:
                continue

            # Count one more URL
            self.nb_url += 1

            # Check if the navigation is allowed by the robots.txt
            if not self.can_fetch(link):
                self.exclude_link(link)
                self.nb_rp += 1
                continue

            # Check if the current file extension is allowed or not.
            if target_extension in self.skipext:
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            # Check if the current url doesn't contain an excluded word
            if not self.exclude_url(link):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            self.tocrawl.add(link)
        return None

    def __continue_crawling(self):
        if self.tocrawl:
            self.__crawling()

    def exclude_link(self, link):
        if link not in self.excluded:
            self.excluded.add(link)

    def checkRobots(self):
        if self.domain[len(self.domain) - 1] != "/":
            self.domain += "/"
        request = Request(self.domain + "robots.txt", headers={"User-Agent": config.crawler_user_agent})
        self.rp = RobotFileParser()
        self.rp.set_url(self.domain + "robots.txt")
        self.rp.read()

    def can_fetch(self, link):
        try:
            if self.parserobots:
                if self.rp.can_fetch("*", link):
                    return True
                else:
                    logging.debug("Crawling of {0} disabled by robots.txt".format(link))
                    return False
            if not self.parserobots:
                return True
            return True
        except:
            # On error continue!
            logging.debug("Error during parsing robots.txt")
            return True

    def exclude_url(self, link):
        for ex in self.exclude:
            if ex in link:
                return False
        return True

    def make_report(self):
        print("Number of found URL : {0}".format(self.nb_url))
        print("Number of link crawled : {0}".format(len(self.crawled)))
        if self.parserobots:
            print("Number of link block by robots.txt : {0}".format(self.nb_rp))
        if self.skipext or self.exclude:
            print("Number of link exclude : {0}".format(self.nb_exclude))

        for code in self.response_code:
            print("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

        for code in self.marked:
            print("Link with status {0}:".format(code))
            for uri in self.marked[code]:
                print("\t- {0}".format(uri))
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a database
    to be searched.
    """
    number_of_scraped_pages = 0

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.
        """
        self.url = url
        if self.url not in black_list and self.url not in scraped_urls:
            self.needs_to_be_scraped = True
        else:
            self.needs_to_be_scraped = False

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled
        """
        # the original tested the misspelled attribute `need_to_be_scraped`
        if self.needs_to_be_scraped is True:
            # REFACTOR to remove try statement.
            try:
                headers = {'User-agent': settings.SPIDER_USER_AGENT}
                self.urlparse = urlparse.urlparse(self.url)
                self.robotcheck = RobotFileParser()
                self.robotcheck.set_url('http://' + self.urlparse[1] + '/robots.txt')  # Only works with http right now.
                self.robotcheck.read()
                self.needs_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
            except:
                self.needs_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page
        """
        self.headers = {'User-agent': settings.SPIDER_USER_AGENT}
        # REFACTOR to remove try
        try:
            self.request = requests.get(self.url, headers=self.headers)
            self.pagehtml = BeautifulSoup(self.request.text)  # REFACTOR, don't use BeautifulSoup
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except:
            raise Exception

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>, <head>,
        <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links, the number of outgoing links, and the text.
        """
        self.title = self.pagehtml.find('title').text
        self.page_text = self.pagehtml.findAll(text=True)

        for item in filter(self.get_visible_elements, self.page_text):
            if item != '\n':
                self.page_text += item

        self.pagelinks = {}
        for link in self.pagehtml.findAll('a'):
            self.pagelinks[link.get('href')] = 1

        for link in self.pagehtml:
            pass
            # determine if link is relative or absolute. if relative, change it to absolute

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds them
        to an index.
        """
        self.pagetextlist = self.page_text.split(' ')  # Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word not in STOP_WORDS:
                if not inverted_index.get(word):
                    inverted_index[word] = {'url': self.url, 'offsets': [index]}
                else:
                    inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
class spider(object): CurLink = "" linkURI = [] texts = [] Meta = {} def __init__(self, link): self.CurLink = link self.r = RobotFileParser() def crawl(self): self.r.set_url(urlparse.unquote(self.CurLink)) self.r.read() self.html = urlopen(self.CurLink).read() self.bs = BeautifulSoup(self.html, "lxml") for script in self.bs(["script", "style"]): script.extract() text = self.bs.get_text() lines = (line.strip() for line in text.splitlines()) chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) for chunk in chunks: if chunk: self.texts.append(chunk) # site = urlparse.urlsplit(self.CurLink).scheme + "://" + urlparse.urlsplit(self.CurLink).netloc + "/sitemap.aspx" # r = requests.get(site) if requests.get( urlparse.urlsplit(self.CurLink).scheme + "://" + urlparse.urlsplit(self.CurLink).netloc + "/sitemap.aspx").ok == True: root = etree.fromstring( requests.get( urlparse.urlsplit(self.CurLink).scheme + "://" + urlparse.urlsplit(self.CurLink).netloc + "/sitemap.xml").content) for sitemap in root: children = sitemap.getchildren() self.linkURI.append(children[0].text) elif requests.get( urlparse.urlsplit(self.CurLink).scheme + "://" + urlparse.urlsplit(self.CurLink).netloc + "/sitemap.xml").ok == True: root = etree.fromstring( requests.get( urlparse.urlsplit(self.CurLink).scheme + "://" + urlparse.urlsplit(self.CurLink).netloc + "/sitemap.xml").content) for sitemap in root: children = sitemap.getchildren() self.linkURI.append(children[0].text) else: for link in self.bs.findAll('a', href=True): aLink = urlparse.urljoin(self.CurLink, link['href']) if (self.r.can_fetch("*", aLink)): self.linkURI.append(aLink) page = metadata_parser.MetadataParser(url=self.CurLink) meta = page.metadata keyw = "null" descr = "null" if (meta.get('meta').get('Keywords')): keyw = meta['meta']['Keywords'].split(', ') if (meta.get('meta').get('Description')): descr = meta['meta']['Description'] self.Meta = { 'title': meta['page']['title'], 'url': meta['_internal']['url_actual'], 'description': descr, 'keyword': keyw }
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
        Constructor method that initializes the members that are used during crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """
        logging.info("Consider Robot.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}  # map that records the visits of urls, datemodified and assets
        self.network = {}   # map that maintains the network/graph of webpages visited
                            # The intention of this map is for visual rendering using d3.js
        self.unvisited = set([])  # a set to keep the list of urls yet to be visited
        self.start_page = None    # the root page, this is used to avoid cycle and keeping crawl
                                  # process limited to single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:" + main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
        This method holds the invoking control of the crawler method and drives the
        crawling process. Basically a BFS style method that keeps popping the elements
        from the queue [self.unvisited set] and scraping the urls.

        Once the crawling process is done, this creates sitemap using the self.site_map
        dictionary with just url, date-modified tags with dummy frequency and priorities.

        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()

        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                xmlns:xhtml="http://www.w3.org/1999/xhtml"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
                http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
            \t\t<loc>%s</loc>\n\
            \t\t<lastmod>%s</lastmod>\n\
            \t\t<changefreq>monthly</changefreq>\n\
            \t\t<priority> 1 </priority>\n\
            \t</url>\
        "

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"
        xml += footer

        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
        A utility method to just write the contents of the file into a given file name.
        Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content: contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
        There are different ways a href could specify a location and it varies in
        different ways based on how the page is designed. This method takes few styles
        into consideration and ignores some, cleans and creates a valid url link so as
        to keep it ready for the crawl method.

        :param url: base url of the current page
        :param href: one of the hyper links of the page
        :return: a well formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s" % (url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s" % (url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s" % (url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href
        return href

    def get_out_going_edges(self, url, html_body):
        """
        This method encompasses the BFS along with the coupling with crawl and generator
        as it changes the state of the unvisited map. Basically this method extracts the
        links that belong to the same domain as the start page, cleans them with
        compose_url_from_href method and updates the map. This also avoids unnecessary
        traps like href links pointing to 'javascript', 'mailto' etc.

        :param url: current page url
        :param html_body: current page's html content
        :return: returns all the valid and wellformed out going links from this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            href = self.compose_url_from_href(url, href.decode("utf-8"))

            # clean the href so that it will have legitimate urls instead of
            # cluttered ones and q=param prints
            href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assuming it can be crawled to make sitemap complete
            if not str(new_page.netloc).endswith(self.start_page):  # doesn't belong to domain
                continue

            if self.robot_allows(href) and \
                    not href in self.site_map.keys() and \
                    not href in self.unvisited and \
                    not 'javascript:' in href and \
                    not 'mailto:' in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
        Any time a specific url of a site is changed, its last-modified date and time
        are kept in the page headers. This info helps bots and crawlers to not to crawl
        the page if it has not been updated since last crawl. This method is used to
        preserve the url crawled and its last-modified time along with assets scraped
        into the container dictionary for later usage to generate sitemap and
        visualization network.

        :param url: url of the just finished crawling page
        :param headers: header information of the crawled page
        :param html_body: html content of the page
        :return: None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
        A html page could contain other links such as .css, .img, .mp4 and .js. All
        these files are not dynamic though they could produce dynamic results. The code
        or text that exists in these files is constant and static. These files are
        referred as static assets and for the definition of this challenge, I have
        chosen to keep all the info in a single dictionary and extract them at the end
        for reports, results and stats.

        :param html_body: html content of the page.
        :return: returns a dictionary that encompasses .css, .img, .js files as lists.
        """
        # add static assets of the page .css, .js and image urls may be ?
        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})

        # js is tricky: I faced an issue with inline javascript and ignoring it for the time being.
        # an extract like html_body with just needed parts is a must for excluding inline scripts and styles.
        jss = []
        for x in soup.findAll('script'):
            try:
                # the original appended to the `list` builtin by mistake,
                # and then reset jss, so script sources were never collected
                jss.append(x['src'])
            except KeyError:
                pass

        csss = []
        imgs = []
        for link in css:
            csss.append(link['href'])
        for link in img:
            imgs.append(link['src'])

        return {'css': csss, 'img': imgs, 'js': jss}

    def crawl(self):
        """
        The main driver method that crawls the pages. This main does below steps:
        for every unvisited [vertex|page] that belongs to the requested domain:
            crawl the page
            record valid links and their last-modified-dates

        :return: None
        """
        page = self.unvisited.pop()
        # if robot.txt is defined, use Disallow to avoid pages. domain.robot.txt doesn't exist so the crawler
        # must find all the pages for report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read()  # response.getcode()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(
                len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None

        connects = self.get_out_going_edges(url, html_body)

        # simple Graph that keeps the order of the pages crawled
        # (the original assigned the same entry once per outgoing link)
        self.network[page] = {
            'to': connects,
            'assets': {
                'css': self.site_map[page]['assets']['css'],
                'js': self.site_map[page]['assets']['js'],
                'img': self.site_map[page]['assets']['img']
            }
        }
        return None

    def get_site_map(self):
        """
        Returns the compiled sitemap structure
        :return: sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
        Returns the compiled network in the order of the crawled pages
        :return: network graph
        """
        return self.network

    def get_network_json_format(self):
        """
        Returns the crawl traverse order sequence in json format
        :return: network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
        This could be useful if one is testing
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules:
            return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except:
            return True
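# Hypothetical invocation of the SiteMap class above; the string "True" for
# robotrules mirrors the comparison used in its constructor, and the URL is
# illustrative only.
sm = SiteMap("http://example.com", robotrules="True")
sm.generate("sitemap.xml")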
def __init__(self, url=''):
    RobotFileParser.__init__(self, url)
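A subclass like the one above is usually the starting point for attaching extra state to the parser. A minimal sketch, assuming Python 3's urllib.robotparser, of a hypothetical subclass that records when the rules were fetched so stale rules can be refreshed:

import time
from urllib.robotparser import RobotFileParser

class TimedRobotFileParser(RobotFileParser):
    """Hypothetical subclass that remembers when robots.txt was read."""

    def __init__(self, url=''):
        RobotFileParser.__init__(self, url)
        self.fetched_at = None

    def read(self):
        RobotFileParser.read(self)
        self.fetched_at = time.time()  # record fetch time for cache expiry

    def is_stale(self, max_age=3600):
        # rules that were never fetched count as stale
        return self.fetched_at is None or time.time() - self.fetched_at > max_age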
class MarioDepth:
    def __init__(self, starturl, callback, callpre=None, callfail=None,
                 concount=MAXCONCOUNT, depth=2, accept_url_patterns=None,
                 reject_url_patterns=None):
        self.concount = concount
        self.callback = callback
        self.callpre = callpre
        self.callfail = callfail
        self.depth = depth
        self.starturl = starturl
        self.baseurl = URL.baseurl(starturl)
        self.urls = []
        self.crawled = {}
        self.link_title_db = LinkTitleDB()
        self.accept_url_patterns = accept_url_patterns
        self.reject_url_patterns = reject_url_patterns
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.referer = starturl
        try:
            self.robotstxt.read()
        except:
            logger.debug(Traceback())
        #self.lightcloud = LightCloud.connect('n0')

    def __call__(self, n=None):
        if n:
            self.concount = n
        current_depth = self.depth
        self.urls.append((self.starturl, current_depth))
        while self.urls:
            self.depth_get()
            logger.debug('%d unprocessed urls' % (len(self.urls)))

    def depth_get(self):
        mario = MarioBatch(callback=self.next_depth, callpre=self.callpre,
                           callfail=self.callfail)
        pool = coros.CoroutinePool(max_size=len(self.urls))
        while self.urls:
            waiters = []
            #self.add_job(mario)
            counter = 0
            while self.urls:
                if counter > 9:
                    break
                counter += 1
                waiters.append(pool.execute(self.add_job, mario))
            logger.debug('Depth break')
            for waiter in waiters:
                waiter.wait()
            mario(self.concount)

    def add_job(self, mario):
        if not self.urls:
            return
        url, depth = self.urls.pop()
        if self.visited(url, depth):
            return
        mario.add_job(url, args=depth)

    def visited(self, url, depth):
        #is_duplicate = URL.is_duplicate(url, self.lightcloud)
        is_duplicate = False  # LightCloud duplicate check disabled above; default to False
        return depth == 0 and is_duplicate or \
            depth < self.depth and self.crawled.has_key(url) and self.crawled[url] == 2

    def next_depth(self, response):
        #with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
        for link, title in URL.link_title(response.body, response.effective_url):
            if not self.inject_url(link, response.args):
                continue
            self.link_title_db.add(link, response.effective_url, title)
        if callable(self.callback):
            self.callback(response)
        self.crawled[response.effective_url] = 2
        if response.effective_url != response.url:
            self.crawled[response.url] = 2
        self.referer = response.effective_url

    def inject_url(self, url, depth):
        if not (depth and url and url not in self.crawled):
            #logger.debug('IGNORE(%d): %r' % (depth, url))
            return None
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        if self.reject_url(url):
            logger.debug('REJECT: %r' % url)
            return None
        try:
            can_fetch = self.robotstxt.can_fetch(USER_AGENT['safari'], url)
        except:
            can_fetch = True
        if self.baseurl != 'http://hi.baidu.com/' and not can_fetch:
            logger.debug('DISALLOW: %r' % url)
            return None
        logger.debug('INJECT(%d): %r' % (depth - 1, url))
        self.crawled[url] = 1
        self.urls.append((url, depth - 1))
        return True

    def reject_url(self, url):
        return self.baseurl != URL.baseurl(url) and \
            (not self.accept_url_patterns or
             not re.match('|'.join(self.accept_url_patterns), url) or
             self.reject_url_patterns or
             re.match('|'.join(self.reject_url_patterns), url))
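Most of MarioDepth leans on project-specific helpers (MarioBatch, URL, coros, LinkTitleDB), but its robots.txt guard is separable: read once at startup, fail open if the file is unreadable or can_fetch raises. A minimal sketch of that same guard on its own, assuming Python 3 names:

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def robots_guard(starturl, user_agent='*'):
    """Return a callable url -> bool for starturl's host; allow all on errors."""
    rp = RobotFileParser()
    rp.set_url(urljoin(starturl, '/robots.txt'))
    try:
        rp.read()
    except Exception:
        return lambda url: True  # unreadable robots.txt: fail open, as above

    def allowed(url):
        try:
            return rp.can_fetch(user_agent, url)
        except Exception:
            return True

    return allowed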
def __init__(self, link):
    self.CurLink = link
    self.r = RobotFileParser()
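This constructor only stores the link next to an empty parser; the parser still needs a URL and a read() before it can answer anything. A sketch of how the check might be completed (the class and method names here are hypothetical, Python 3 names assumed):

from urllib.parse import urlparse, urlunparse
from urllib.robotparser import RobotFileParser

class LinkChecker:
    def __init__(self, link):
        self.CurLink = link
        self.r = RobotFileParser()

    def is_allowed(self, agent='*'):
        # point the parser at the link's own host before reading
        parts = urlparse(self.CurLink)
        self.r.set_url(urlunparse((parts.scheme, parts.netloc,
                                   '/robots.txt', '', '', '')))
        self.r.read()
        return self.r.can_fetch(agent, self.CurLink)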
class SimpleCrawler:

    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self, starturl, index_html='', maxlevel=1,
                 cookie_file=None, acldb=None, urldb=None,
                 default_charset=None, delay=0, timeout=300, debug=0):
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        # assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        # self.robotstxt.read()
        self.conn = None
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]
        self.crawled = {}  # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        if (not self.curlevel) or (not url) or (url in self.crawled):
            return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >> stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >> stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel - 1))
        return True

    def get1(self, url, maxretry=5, maxredirect=5):
        if self.debug:
            print >> stderr, 'GET: %r' % url
        # loop
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >> stderr, 'Making connection: %r...' % (self.hostport,)
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET', req.get_selector().replace(' ', ''), '', headers)
                    # self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >> stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # connection closed unexpectedly
                    print >> stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
class WebPage(object):
    def __init__(self, url):
        self.page_url = url
        self.parsed_url = urlparse.urlparse(url)
        self.lang = ""
        self.isDownload = False
        self.title = ""
        self.text = ""
        self.soup = None
        self.robot = RobotFileParser()

    def __normalize_link__(self, link):
        if not link:
            return None
        if link.startswith('//'):
            return self.parsed_url.scheme + ':' + link
        elif link.startswith('/'):
            return self.parsed_url.scheme + '://' + self.parsed_url.hostname + link
        elif link.startswith('http://') or link.startswith('https://'):
            return link
        elif link.startswith("irc://"):
            return None
        elif link.startswith('#') or link.startswith('javascript:'):
            return None
        else:
            return urlparse.urljoin(self.page_url, link)

    def __delete_unnecessary_tags(self):
        if self.soup is None:
            return
        if self.soup.title is None:
            self.title = ""
        else:
            self.title = self.soup.title.string
        for tag in self.soup(['style', 'script', '[document]', 'head', 'title']):
            tag.decompose()

    def __get_stems(self, text):
        if self.lang in LANGUAGES:
            stemer = snowballstemmer.stemmer(LANGUAGES[self.lang])
        else:
            raise NotImplementedError("That lang not implemented")
        stems_dict = dict()
        for char in [",", ". ", "!", "?", " - ", "\n"]:  # "\n" fixed from "/n"
            text = text.replace(char, " ")
        for word in text.split():
            stem_word = stemer.stemWord(word.lower())
            if stem_word in stems_dict:
                stems_dict[stem_word] += 1
            else:
                stems_dict[stem_word] = 1
        return stems_dict

    def download_page(self):
        try:
            self.robot.set_url("{0}://{1}/robots.txt".format(
                self.parsed_url.scheme, self.parsed_url.hostname))
            self.robot.read()
            if self.robot.can_fetch("*", self.page_url):
                response = requests.get(self.page_url, verify=False)
            else:
                return False
        except requests.exceptions.InvalidSchema:
            return False
        except KeyError:
            return False
        except Exception:
            return False
        if response.status_code == 200:
            self.soup = BeautifulSoup(response.text, "html.parser")
            self.__delete_unnecessary_tags()
            self.text = "".join(self.soup.strings)
            try:
                self.lang = detect(self.text)
            except Exception:
                self.lang = "en"
            self.isDownload = True
            return True
        else:
            return False

    def get_links(self):
        if not self.isDownload:
            raise Exception("You should download page")

        def get_links_generator():
            for link in self.soup.find_all("a"):
                normalized_link = self.__normalize_link__(link.get("href"))
                if normalized_link is None:
                    continue
                else:
                    yield normalized_link

        return get_links_generator()

    def get_text_stems(self):
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.text)

    def get_title_stems(self):
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.title)

    def get_domain(self):
        return self.parsed_url.hostname
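A short usage sketch for this class, assuming the module-level names it relies on (requests, BeautifulSoup, langdetect's detect, snowballstemmer, and a LANGUAGES mapping) are defined as in its original project:

page = WebPage("http://example.com/")
if page.download_page():           # checks robots.txt before fetching
    for link in page.get_links():  # yields normalized absolute URLs
        print(link)
    print(page.get_domain())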
class SimpleCrawler:

    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self, starturl, index_html='', maxlevel=1,
                 cookie_file=None, acldb=None, urldb=None,
                 default_charset=None, delay=0, timeout=300, debug=0):
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.robotstxt.read()
        self.conn = None
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]
        self.crawled = {}  # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        if (not self.curlevel) or (not url) or (url in self.crawled):
            return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >>stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >>stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel - 1))
        return True

    def get1(self, url, maxretry=3, maxredirect=3):
        if self.debug:
            print >>stderr, 'GET: %r' % url
        # loop
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >>stderr, 'Making connection: %r...' % (self.hostport,)
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET', req.get_selector().replace(' ', ''), '', headers)
                    self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
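The two SimpleCrawler variants differ mainly in whether robots.txt is read eagerly in the constructor (this one) or left commented out (the earlier one, whose can_fetch calls then run against an unread parser). If eager reading is unwanted, deferring the read until the first check is a common middle ground; a minimal sketch of that idea, assuming Python 3 names and not part of either variant:

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

class LazyRobots:
    """Read robots.txt on the first can_fetch() call, not at construction."""

    def __init__(self, starturl):
        self._rp = RobotFileParser()
        self._rp.set_url(urljoin(starturl, '/robots.txt'))
        self._read = False

    def can_fetch(self, agent, url):
        if not self._read:
            try:
                self._rp.read()  # one network round-trip, deferred until needed
            except Exception:
                self._rp.allow_all = True  # unreachable robots.txt: fail open
            self._read = True
        return self._rp.can_fetch(agent, url)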
def knock(self, user_agent, url, override, retries=0, debug_force_status=None):
    """
    Makes a request for '/robots.txt' and returns True if 'user_agent' can
    fetch 'url'; returns False otherwise.

    If we get an HTTP response code other than '200', or any request error
    occurs, this function will return True.

    If we get a gaierror (DNS lookup error), this function will return False,
    as everything else is doomed to fail.

    If 'override' is True, this function will automatically return True.
    The default value for 'override' is False.
    """
    if override:
        return True
    host = net.urlparse(url)[1]
    robot = RobotFileParser()
    clearance = False
    if retries > 0:
        time_mod.sleep(self.crawl_delay)
    try:
        # We try to get the resource /robots.txt
        connection = net.HTTPConnection(host, 80)
        connection.request(self.GET, "/robots.txt", None,
                           {"User-Agent": user_agent})
        response = connection.getresponse()
        robot_lines = response.read().splitlines()
        connection.close()
        if debug_force_status:
            response.status = debug_force_status
        if response.status == 200 and filter(None, robot_lines) != []:
            # If everything went well, we feed the content of the resource to the parser
            robot.parse(robot_lines)
            # And resolve if we have clearance to fetch the url
            clearance = robot.can_fetch(user_agent, url)
            # We try to get the Crawl-delay directive, if it exists
            try:
                self.crawl_delay = int("".join(list(
                    directive for directive in robot_lines
                    if directive.lower().startswith("crawl-delay")
                )).split(":")[1])
            except IndexError:
                # If no 'Crawl-delay' is specified, we leave it at 1 second
                pass
        elif response.status in [408, 500, 503]:
            if retries < 3:
                try:
                    time_mod.sleep(self.current_headers["retry-after"] - self.crawl_delay)
                except KeyError:
                    pass
                except TypeError:
                    pass
                clearance = self.knock(user_agent, url, False, retries + 1)
            else:
                clearance = True
        else:
            clearance = True
        if retries < 1:
            time_mod.sleep(self.crawl_delay)
        return clearance
    except net.HTTPException:
        # A request error occurred. We retry the request; if it keeps failing
        # we just ignore /robots.txt and proceed.
        if retries < 3:
            return self.knock(user_agent, url, False, retries + 1)
        else:
            return True
    except net.timeout:
        # Request timed out. We retry the request; if it keeps failing we
        # just ignore /robots.txt and proceed.
        if retries < 3:
            return self.knock(user_agent, url, False, retries + 1)
        else:
            return True
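The manual string-scanning for "crawl-delay" above predates library support for the directive. On Python 3.6+, RobotFileParser exposes it directly via crawl_delay(); a minimal sketch of the same fetch-and-check with that API:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("http://example.com/robots.txt")
rp.read()
print(rp.can_fetch("MyCrawler/1.0", "http://example.com/page"))
delay = rp.crawl_delay("MyCrawler/1.0")   # None if no Crawl-delay directive
print(delay if delay is not None else 1)  # fall back to 1 second, as above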
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
        Constructor method that initializes the members that are used during the crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """
        logging.info("Consider robots.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}  # map that records the visits of urls, datemodified and assets
        self.network = {}   # map that maintains the network/graph of webpages visited;
                            # the intention of this map is for visual rendering using d3.js
        self.unvisited = set([])  # a set to keep the list of urls yet to be visited
        self.start_page = None    # the root page; this is used to avoid cycles and to keep
                                  # the crawl process limited to a single domain
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, please provide a valid url: " + main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robots.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robots.txt file")
                self.robotrules = False  # error reading robots.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
        This method holds the invoking control of the crawler method and drives the
        crawling process. Basically a BFS style method that keeps popping elements
        from the queue [self.unvisited set] and scraping the urls.

        Once the crawling process is done, this creates the sitemap from the
        self.site_map dictionary with just url and date-modified tags, plus dummy
        frequency and priority values.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xhtml="http://www.w3.org/1999/xhtml"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
    http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
"""
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
\t\t<loc>%s</loc>\n\
\t\t<lastmod>%s</lastmod>\n\
\t\t<changefreq>monthly</changefreq>\n\
\t\t<priority> 1 </priority>\n\
\t</url>\
"
        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"
        xml += footer
        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
        A utility method to write the given contents into a file with the given name.
        Alert: this overwrites the file if it already exists in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content: contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
        There are different ways a href could specify a location, and it varies based
        on how the page is designed. This method takes a few styles into consideration
        and ignores some; it cleans and creates a valid url link so as to keep it
        ready for the crawl method.
        :param url: base url of the current page
        :param href: one of the hyper links of the page
        :return: a well-formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s" % (url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s" % (url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s" % (url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href
        return href

    def get_out_going_edges(self, url, html_body):
        """
        This method encompasses the BFS along with the coupling with crawl and
        generator, as it changes the state of the unvisited map. Basically this method
        extracts the links that belong to the same domain as the start page, cleans
        them with the compose_url_from_href method and updates the map. This also
        avoids unnecessary traps like href links pointing to 'javascript',
        'mailto' etc.
        :param url: current page url
        :param html_body: current page's html content
        :return: returns all the valid and well-formed outgoing links from this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            href = self.compose_url_from_href(url, href.decode("utf-8"))

            # clean the href so that it will have legitimate urls instead of
            # cluttered ones and q=param prints
            href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assume it can be crawled
            # to make the sitemap complete
            if not str(new_page.netloc).endswith(self.start_page):  # doesn't belong to domain
                continue

            if self.robot_allows(href) and \
               not href in self.site_map.keys() and \
               not href in self.unvisited and \
               not 'javascript:' in href and \
               not 'mailto:' in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
        Any time a specific url of a site is changed, its last-modified date and time
        are kept in the page headers. This info helps bots and crawlers avoid
        re-crawling a page that has not been updated since the last crawl. This method
        is used to preserve the crawled url and its last-modified time, along with the
        scraped assets, in the container dictionary for later usage to generate the
        sitemap and the visualization network.
        :param url: url of the page that just finished crawling
        :param headers: header information of the crawled page
        :param html_body: html content of the page
        :return: None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
        An html page could contain links to other files such as .css, .img, .mp4 and
        .js. All these files are not dynamic, though they could produce dynamic
        results; the code or text that exists in these files is constant and static.
        These files are referred to as static assets, and for the definition of this
        challenge I have chosen to keep all the info in a single dictionary and
        extract it at the end for reports, results and stats.
        :param html_body: html content of the page.
        :return: returns a dictionary that encompasses the .css, .img, .js files as lists.
        """
        # add static assets of the page: .css, .js and image urls
        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})

        # js is tricky: I faced an issue with inline javascript and am ignoring it for
        # the time being. An extract like html_body with just the needed parts is a
        # must for excluding inline scripts and styles.
        jss = []
        for x in soup.findAll('script'):
            try:
                jss.append(x['src'])  # was list.append(x['src']), which collected nothing
            except KeyError:
                pass

        csss = []
        imgs = []
        for link in css:
            csss.append(link['href'])
        for link in img:
            imgs.append(link['src'])

        return {
            'css': csss,
            'img': imgs,
            'js': jss
        }

    def crawl(self):
        """
        The main driver method that crawls the pages. It does the below steps:
        for every unvisited [vertex|page] that belongs to the requested domain:
            crawl the page
            record valid links and their last-modified-dates
        :return: None
        """
        page = self.unvisited.pop()
        # if robots.txt is defined, use Disallow to avoid pages. domain/robots.txt
        # doesn't exist here, so the crawler must find all the pages for the report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read()  # response.getcode()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(
                len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)

        # simple Graph that keeps the order of the pages crawled.
        # (the original looped over enumerate(connects) and re-assigned the same
        # entry every iteration; assigning once is equivalent)
        self.network[page] = {
            'to': connects,
            'assets': {
                'css': self.site_map[page]['assets']['css'],
                'js': self.site_map[page]['assets']['js'],
                'img': self.site_map[page]['assets']['img']
            }
        }
        return None

    def get_site_map(self):
        """
        Returns the compiled sitemap structure
        :return: sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
        Returns the compiled network in the order of the crawled pages
        :return: network graph
        """
        return self.network

    def get_network_json_format(self):
        """
        Returns the crawl traverse order sequence in json format
        :return: network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
        This could be useful if one is testing.
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules:
            return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except:
            return True
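A short usage sketch for the class above. Note that the constructor compares its robotrules flag against the string "True", so that is what a caller would pass to enable the robots.txt check:

site_map = SiteMap("http://example.com", robotrules="True")
site_map.generate("sitemap.xml")       # writes sitemap.xml and returns the xml string
graph = site_map.get_network_graph()   # {page: {'to': [...], 'assets': {...}}}
print(site_map.get_network_json_format())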
def _create_robot_file_parser(self, url):
    host = urlparse.urlsplit(url)[1]
    robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    rp.read()
    return rp
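This helper fetches robots.txt on every call. Several of the snippets above instead cache one parser per host, which avoids re-downloading the same file for every URL on a site; a minimal per-host cache in the same spirit, assuming Python 3 urllib names:

from urllib.parse import urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser

_parsers = {}

def robot_parser_for(url):
    """Return a cached RobotFileParser for url's host, reading it only once."""
    host = urlsplit(url)[1]
    if host not in _parsers:
        rp = RobotFileParser(urlunsplit(('http', host, '/robots.txt', '', '')))
        rp.read()
        _parsers[host] = rp
    return _parsers[host]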