# crawler.py (excerpt) -- Crawler.getHyperLinks; assumes module-level
# imports: json, red (colors), genCfg (common.configuration), and a
# send_request() HTTP helper defined elsewhere in the project.
def getHyperLinks(self, url):
    page = {}
    links = []
    cfg = genCfg()
    server_ip = cfg.get('agent', 'server_ip')
    server_port = cfg.get('agent', 'port')
    server_protocol = cfg.get('agent', 'protocol')
    api = cfg.get('agent', 'api')

    data = self.getPageSource(url)  # (status, html) tuple
    if data[0] == "200":
        try:
            # Import lazily so a missing bs4 raises ImportError here,
            # where it can be reported as a friendly install hint.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(data[1], "html.parser")
            # Keep only absolute links; attribute values in CSS selectors
            # must be quoted to parse reliably.
            http_addr = soup.select('a[href^="http://"]')
            https_addr = soup.select('a[href^="https://"]')
            links += map(self.handle_origin_addr, http_addr) + \
                     map(self.handle_origin_addr, https_addr)
        except ImportError:
            print red('Please install the BeautifulSoup4 module first.')

        try:
            # Upload the raw HTML to the collection server.
            page[url] = data[1]
            inner_server_response = send_request(
                ''.join([server_protocol, '://', server_ip, ':',
                         server_port, api]),
                'POST', json.dumps(page))
            print "----- sent upload request -----"
            # '==' (equality), not 'is' (identity), for the status check.
            if inner_server_response.body == "200":
                print "upload html docs done."
        except ValueError, e:
            print "getHyperLinks value error: ", e

    return links
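# --- illustrative sketch (not part of the project source) ---
# The handle_origin_addr helper mapped over the selected <a> tags above is
# defined elsewhere in the Crawler class; the body below is an assumption,
# showing only the shape map() expects: take a bs4 Tag, return a URL string.
def handle_origin_addr(self, addr_tag):
    # addr_tag is an <a> Tag matched by the CSS selector, so its href is
    # already known to be absolute (http:// or https://).
    href = addr_tag.get('href')
    return href.strip() if href else None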
# Agent entry point: read seeds from the [agent] config section and
# start the crawler.
import sys

from colors import red
from crawler import Crawler
from common.configuration import genCfg


def main(seeds, crawl_count):
    craw = Crawler(seeds)
    craw.crawling(crawl_count)


if __name__ == "__main__":
    cfg = genCfg()
    try:
        seeds = cfg.get('agent', 'seeds').split(',')
        crawler_count = cfg.get('agent', 'crawler_count')
        main(seeds, int(crawler_count))
    except KeyboardInterrupt:
        print red("Cancelled by user (Ctrl+C).")
        sys.exit(1)
    except ValueError, err:
        print "value error: ", err
        sys.exit(1)
    except Exception, e:
        print "Unknown exception occurred!", e
        sys.exit(1)
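# --- illustrative config sketch (not part of the project source) ---
# Both modules read the [agent] section through genCfg(). A minimal example
# of the INI file it presumably parses; only the key names come from the
# code above, and every value here is a placeholder:

[agent]
protocol      = http
server_ip     = 192.0.2.10
port          = 8080
api           = /api/v1/pages
seeds         = http://example.com,http://example.org
crawler_count = 10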