def getHyperLinks(self, url):
    """Fetch *url*, collect its absolute hyperlinks and upload the page source.

    The page is downloaded through ``self.getPageSource``.  When the fetch
    succeeds, every ``<a>`` element whose ``href`` starts with ``http://``
    or ``https://`` is passed through ``self.handle_origin_addr`` and the
    results are gathered (http links first, then https links).  The raw
    page source is then POSTed as a JSON object ``{url: source}`` to the
    server described by the ``agent`` section of the configuration
    returned by ``genCfg()``.

    Only successfully fetched pages are parsed and uploaded; a failed
    fetch yields an empty list and no upload is attempted.

    Args:
        url: Address of the page to fetch.

    Returns:
        list: The values produced by ``self.handle_origin_addr`` for each
        matching anchor; empty when the page could not be fetched.
    """
    links = []

    cfg = genCfg()
    server_ip = cfg.get('agent', 'server_ip')
    server_port = cfg.get('agent', 'port')
    server_protocol = cfg.get('agent', 'protocol')
    api = cfg.get('agent', 'api')

    data = self.getPageSource(url)
    # Guard against a missing result as well as a non-"200" status so that
    # indexing below can never fail on None.
    if not data or data[0] != "200":
        return links

    try:
        soup = BeautifulSoup(data[1], "html.parser")
        # Attribute values are quoted: "://" is not a valid unquoted CSS
        # identifier and strict selector engines reject it.
        http_addr = soup.select('a[href^="http://"]')
        https_addr = soup.select('a[href^="https://"]')
        links.extend(self.handle_origin_addr(addr) for addr in http_addr)
        links.extend(self.handle_origin_addr(addr) for addr in https_addr)
    except ImportError:
        print(red('Please install BeautifulSoup4 module first.'))

    upload_url = ''.join([server_protocol, "://", server_ip, ':',
                          server_port, api])
    try:
        inner_server_response = send_request(
            upload_url, 'POST', json.dumps({url: data[1]}))
        print("-----send reques -----for upload..")
        # Compare by value: identity ("is") on strings only works by
        # accident of interning and is not reliable.
        if inner_server_response.body == "200":
            print("upload html docs done.")
    except ValueError as e:
        print("get HyperLinks Value error:  %s" % (e,))

    return links
def getPageSource(self, url, timeout=100, coding=None):
    """Download *url* with a GET request and report the outcome as a pair.

    Args:
        url: Address of the page to fetch.
        timeout: Accepted for interface compatibility; not used by this
            method.
        coding: Accepted for interface compatibility; not used by this
            method.

    Returns:
        list: Always a two-item list ``[status, body]``:

        * ``["200", <response body>]`` when the response code is 200;
        * ``[str(<response code>), None]`` for any other response code;
        * ``[str(<exception>), None]`` when the request raised.
    """
    try:
        http_response = send_request(url, 'GET')
        if http_response.code == 200:
            return ["200", http_response.body]
        # Report non-200 answers explicitly instead of falling off the end
        # and returning None, which callers indexing the result cannot handle.
        return [str(http_response.code), None]
    except Exception as e:
        # Best-effort fetch: log the failure and hand the message back to
        # the caller in the status slot.
        print(str(e))
        return [str(e), None]
# Manual smoke test: POST a sample payload to the local upload endpoint and
# print the body of the response.
from request_sender import send_request

# NOTE(review): the payload uses single quotes, so it is not strict JSON --
# confirm the endpoint accepts it as-is.
print(send_request(
    'http://localhost:8888/upload_html',
    'POST',
    "{'name' : 'jamesduan'}",
).body)