# Code example #1 (score: 0)
# File: crawler.py — project: jamesduan/webcrawler
    def getHyperLinks(self,url):

        page = {}
        links = []
        title_contents = []

        cfg = genCfg()

        server_ip = cfg.get('agent', 'server_ip')
        server_port = cfg.get('agent', 'port')
        server_protocol = cfg.get('agent', 'protocol')
        api = cfg.get('agent', 'api')

        data = self.getPageSource(url)

        #print "got html data."
        #print data

        if data[0] == "200":
            try:
                soup = BeautifulSoup(data[1], "html.parser")

                #origin_links = []
                #for d_addr in soup.find_all('a'):
                #    origin_links.append(d_addr.get('href'))

                #print origin_links

                #for link in map(self.handle_origin_links, origin_links):

                #    if link:
                #        links.append(link)
                http_addr = soup.select('a[href^=http://]')
                https_addr = soup.select('a[href^=https://]')

                links += map(self.handle_origin_addr, http_addr) + \
                    map(self.handle_origin_addr, https_addr)

#                links = list(set(links))

                #print links
            except ImportError, e:
                print red('Please install BeautifulSoup4 module first.')

            try:
                #code_type = sys.getfilesystemencoding()
                #print "code type: ", code_type
                page[url] = data[1]
                #print page
                inner_server_response = send_request(''.join([server_protocol,
                                                    "://", server_ip, ':',
                                                    server_port, api]), 'POST',
                                                    json.dumps(page))

                print "-----send reques -----for upload.."
                if inner_server_response.body is "200":
                    print "upload html docs done."

            except ValueError, e:
                print "get HyperLinks Value error: ", e
# Code example #2 (score: 0)
# File: crawler.py — project: jamesduan/webcrawler
    def getPageSource(self, url, timeout=100, coding=None):

        try:

            http_response = send_request(url, 'GET')

            if http_response.code == 200:
                #print "page : " , utf8(http_response.body)
                return ["200",http_response.body]

        except Exception,e:
            print str(e)
            return [str(e),None]
# Code example #3 (score: 0)
# File: test.py — project: jamesduan/webcrawler

from request_sender import send_request


print send_request('http://localhost:8888/upload_html', 'POST', "{'name' : 'jamesduan'}").body