# Module-level dependencies for the methods below.
import re
from urllib import request


def process_url(self, url):
    """Fetch a page, cache it, and return a set of candidate links to crawl."""
    Helper.debug("process start")
    try:
        source = request.urlopen(url).read()
    except Exception:
        # Unreachable or malformed URL: contribute nothing to the frontier.
        return set()

    Helper.debug("process 1:db")
    # Persist the fetched page (upsert into the pages cache).
    self.db_cache(url, source)

    Helper.debug("process 2:re")
    # Regex for finding links. Raw string avoids invalid-escape warnings;
    # the character class is tightened from [\/aA-zZ0-9] (whose A-z range
    # also matched punctuation) to the intended alphanumerics plus '/'.
    rgx = re.compile(r'a href="(/\S+|[a-zA-Z0-9/]\S+\.\S+)"')
    linkMatches = rgx.findall(source.decode('utf-8', errors='ignore'))

    tempFrontier = set()
    tempFrontier.add(url)

    Helper.debug("process 3:add links")
    if self.frontier.frontQueue.qsize() < 10:
        for link in linkMatches:
            # Keep http(s) and root-relative links; skip ftp and mailto.
            if ('https://' in link or 'http://' in link or link[0] == '/') \
                    and 'ftp.' not in link \
                    and 'ftp://' not in link \
                    and 'mailto:' not in link:
                tempFrontier.add(
                    self.normalize_url(link, Helper.get_domain(url)))

    # tempFrontier -= set(self.get_disallowed_sites(url, 'GingerWhiskeyCrawler'))
    Helper.debug("process end")
    return tempFrontier
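# A minimal sketch of how the commented-out robots filter above could be
# restored. get_disallowed_sites returns path prefixes (e.g. '/admin'),
# while tempFrontier holds full normalized URLs, so plain set subtraction
# would never match anything; a prefix test on each URL's path component
# is assumed instead. The helper name filter_disallowed is hypothetical.
from urllib.parse import urlparse


def filter_disallowed(self, tempFrontier, url):
    """Drop frontier candidates whose path matches a Disallow prefix."""
    disallowed = self.get_disallowed_sites(url, 'GingerWhiskeyCrawler')
    allowed = set()
    for candidate in tempFrontier:
        path = urlparse(candidate).path
        if not any(path.startswith(rule) for rule in disallowed):
            allowed.add(candidate)
    return allowed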
def get_disallowed_sites(self, url, myAgent):
    """Return the Disallow paths that apply to myAgent (or '*') on url's domain."""
    Helper.debug("Get disallowed sites 1")
    domain = Helper.get_domain(url)
    if domain in self.robots:
        # Per-domain memoization: fetch each robots.txt at most once.
        return self.robots[domain]
    try:
        robot = request.urlopen('http://' + domain + '/robots.txt')
        Helper.debug('  Fetching robots.txt: ' + domain)
    except Exception:
        # No robots.txt (or unreachable): treat as nothing disallowed.
        return []

    reAgent = re.compile(r"User-[aA]gent: *(\S+) *$")
    reDis = re.compile(r"Disallow: *(/\S*) *$")
    agent = None
    disallowed = {}

    Helper.debug("Get disallowed sites 2")
    for line in robot:
        # Decode the raw bytes and strip the trailing newline, rather than
        # scrubbing escape sequences out of the bytes repr.
        l = line.decode('utf-8', errors='ignore').rstrip('\r\n')
        agentMatch = reAgent.findall(l)
        if agentMatch:
            agent = agentMatch[0]
            disallowed[agent] = []
        disMatch = reDis.findall(l)
        if disMatch and agent in disallowed:
            disallowed[agent].append(disMatch[0])

    Helper.debug("Get disallowed sites 3")
    # Paths are returned as-is, not normalized to full URLs.
    result = []
    if myAgent in disallowed:
        result.extend(disallowed[myAgent])
    if '*' in disallowed:
        result.extend(disallowed['*'])

    Helper.debug("Get disallowed sites 4")
    self.robots[domain] = result
    return result
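# Quick standalone check of the robots.txt parsing logic above, run against
# an in-memory sample instead of a live fetch; the sample content is made up
# purely for illustration.
import re

SAMPLE = b"""User-agent: GingerWhiskeyCrawler
Disallow: /private
User-agent: *
Disallow: /tmp
"""

reAgent = re.compile(r"User-[aA]gent: *(\S+) *$")
reDis = re.compile(r"Disallow: *(/\S*) *$")
agent, disallowed = None, {}
for line in SAMPLE.splitlines():
    l = line.decode('utf-8').rstrip('\r\n')
    if reAgent.findall(l):
        agent = reAgent.findall(l)[0]
        disallowed[agent] = []
    if reDis.findall(l) and agent in disallowed:
        disallowed[agent].append(reDis.findall(l)[0])

print(disallowed)
# {'GingerWhiskeyCrawler': ['/private'], '*': ['/tmp']}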