コード例 #1
0
ファイル: crawler.py プロジェクト: jamesduan/webcrawler
    def getHyperLinks(self,url):
        """Fetch ``url``, collect its absolute links and upload the page source.

        The page is fetched with ``self.getPageSource(url)``. Nothing else
        happens unless the first element of that result is the string
        ``"200"``; the second element is then treated as the HTML source.

        On success two independent steps run:

        1. Every ``<a>`` tag whose ``href`` starts with ``http://`` or
           ``https://`` is passed through ``self.handle_origin_addr`` and the
           results are gathered in a local list.
        2. ``{url: source}`` is serialised to JSON and POSTed with
           ``send_request`` to the endpoint built from the ``agent`` section
           of the configuration (``protocol``, ``server_ip``, ``port``,
           ``api``).

        Args:
            url: address of the page to fetch.

        Returns:
            None.

        A ``ValueError`` raised during the upload step is caught and printed;
        other exceptions propagate to the caller.
        """
        page = {}
        links = []

        cfg = genCfg()

        server_ip = cfg.get('agent', 'server_ip')
        server_port = cfg.get('agent', 'port')
        server_protocol = cfg.get('agent', 'protocol')
        api = cfg.get('agent', 'api')

        data = self.getPageSource(url)

        # Only successfully fetched pages are parsed and uploaded.
        if data[0] != "200":
            return

        try:
            soup = BeautifulSoup(data[1], "html.parser")

            # The attribute values are quoted because "http://" is not a valid
            # CSS identifier; strict selector parsers reject the unquoted form.
            http_addr = soup.select('a[href^="http://"]')
            https_addr = soup.select('a[href^="https://"]')

            # http links first, then https links, as before.
            links += [self.handle_origin_addr(anchor)
                      for anchor in list(http_addr) + list(https_addr)]
            # NOTE(review): the collected links are not returned or stored
            # anywhere in the code visible here -- confirm whether the caller
            # expects them back.
        except ImportError:
            # NOTE(review): presumably meant to report a missing bs4 install,
            # but no import happens inside this method -- confirm this handler
            # can actually fire.
            print(red('Please install BeautifulSoup4 module first.'))

        try:
            page[url] = data[1]
            upload_url = ''.join([server_protocol, "://", server_ip, ':',
                                  server_port, api])
            inner_server_response = send_request(upload_url, 'POST',
                                                 json.dumps(page))

            print("-----send reques -----for upload..")
            # Compare by value: ``is`` tests object identity, which is not a
            # reliable equality check for strings created at runtime.
            if inner_server_response.body == "200":
                print("upload html docs done.")

        except ValueError as e:
            # "%s %s" reproduces the single separating space that the
            # two-argument print statement used to emit.
            print("%s %s" % ("get HyperLinks Value error: ", e))
コード例 #2
0
ファイル: main.py プロジェクト: jamesduan/webcrawler
import sys, ConfigParser

from colors import red

from crawler import Crawler

from common.configuration import genCfg

def main(seeds,crawl_count):
    """Create a ``Crawler`` from ``seeds`` and invoke ``crawling(crawl_count)`` on it.

    Args:
        seeds: value handed unchanged to the ``Crawler`` constructor.
        crawl_count: value handed unchanged to ``Crawler.crawling``.
    """
    Crawler(seeds).crawling(crawl_count)

if __name__ == "__main__":

    # Script entry point: read the crawl settings from the ``agent`` section
    # of the configuration and start crawling. Every handled failure exits
    # with status 1.
    cfg = genCfg()

    try:
        # ``seeds`` is a comma-separated list. Strip the whitespace around
        # each entry and drop empty ones so that "a, b" or a trailing comma
        # does not produce malformed seeds.
        raw_seeds = cfg.get('agent', 'seeds').split(',')
        seeds = [seed.strip() for seed in raw_seeds if seed.strip()]
        crawler_count = cfg.get('agent', 'crawler_count')
        # int() raises ValueError for a non-numeric count; handled below.
        main(seeds, int(crawler_count))

    except KeyboardInterrupt:
        print(red("Cancelled by user type Ctrl+c "))
        sys.exit(1)
    except ValueError as err:
        # "%s %s" reproduces the single separating space that the
        # two-argument print statement used to emit.
        print("%s %s" % ("value errr: ", err))
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected and exit non-zero.
        print("%s %s" % ("Unkown exception occurred!", e))
        sys.exit(1)