Example #1
class SpiderRun(object):
    '''
    Main scheduler for the crawler.
    '''
    def __init__(self):
        self.manager = URLManage()
        self.parser = HTMLParse()
        self.downloader = HTMLDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 5:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawling URL:", new_url)
                print("{} links crawled so far".format(self.manager.old_url_size()))
            except Exception as e:
                print("Exception is:", e)
        self.output.output_html()
        print(self.manager.new_url_size())
        print(self.output.datas_size())
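The scheduler above shows only the control loop; URLManage, HTMLDownloader, HTMLParse and DataOutput come from the project's own modules (Example #9 shows the corresponding imports). For orientation, a minimal in-memory sketch of the interface the loop relies on might look like the following; the method bodies are illustrative assumptions, not the project's actual implementation, and requests is only assumed here as a download backend.

# Hypothetical minimal collaborators for the scheduler; the real project modules may differ.
import requests  # assumed HTTP backend for this sketch


class URLManage(object):
    '''Hypothetical URL manager: tracks URLs still to crawl and URLs already crawled.'''
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # move one URL from the "new" set to the "old" set and hand it out
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)


class HTMLDownloader(object):
    '''Hypothetical downloader: fetches a page and returns its HTML, or None on failure.'''
    def download(self, url):
        response = requests.get(url, timeout=10)
        return response.text if response.status_code == 200 else None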
Example #2
 def __init__(self):
     # Initialize the worker node
     # Using BaseManager to register Queue's method and name
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # Connect to the server (master) node
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     # the port and authkey must match the master node; the connection can go over the network
     self.m = BaseManager(address=(server_addr, 8001),
                          authkey='wiki'.encode('utf-8'))
     self.m.connect()
     # Get the queue object
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     # Initialize the HTMLDownloader and HTMLParser
     self.downloader = HTMLDownloader()
     self.parser = HTMLParser()
     print('init finish')
Example #3
class Worker(object):
    def __init__(self):
        # Initialize the worker node
        # Using BaseManager to register Queue's method and name
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Connect to the server (master) node
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # the port and authkey must match the master node; the connection can go over the network
        self.m = BaseManager(address=(server_addr, 8001),
                             authkey='wiki'.encode('utf-8'))
        self.m.connect()
        # Get the queue object
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the HTMLDownloader and HTMLParser
        self.downloader = HTMLDownloader()
        self.parser = HTMLParser()
        print('init finish')

    def crawl(self):
        cnt = 0
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Master node notify worker stop crawling')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('worker node is working on URL: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({"new_urls": new_urls, "data": data})
                    cnt += 1
                else:
                    if cnt >= 2:
                        break

            except EOFError as e:
                print("Connection to the master node failed")
                return
            except Exception as e:
                print(e)
                print('Crawl failed!')
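Example #3 only shows the worker side of the distributed crawler; the master node listening on 127.0.0.1:8001 is not part of this listing. A minimal sketch of a matching master is given below. Only the address, port, authkey and the registered names get_task_queue / get_result_queue are taken from the worker snippet; the queue setup, the page limit and the result-collection loop are assumptions for illustration.

# Hypothetical master node matching the worker above; not the project's original code.
import queue
from multiprocessing.managers import BaseManager

task_queue = queue.Queue()
result_queue = queue.Queue()


def get_task_queue():
    return task_queue


def get_result_queue():
    return result_queue


class Master(object):
    def __init__(self):
        # Expose the two queues under the names the worker registers.
        BaseManager.register('get_task_queue', callable=get_task_queue)
        BaseManager.register('get_result_queue', callable=get_result_queue)
        self.manager = BaseManager(address=('127.0.0.1', 8001),
                                   authkey='wiki'.encode('utf-8'))
        self.manager.start()
        self.task = self.manager.get_task_queue()
        self.result = self.manager.get_result_queue()

    def run(self, root_url, max_pages=2):
        self.task.put(root_url)                 # seed URL for the workers
        crawled = 0
        while crawled < max_pages:
            item = self.result.get()            # blocks until a worker reports back
            for url in item['new_urls']:
                self.task.put(url)
            crawled += 1
        self.task.put('end')                    # ask the workers to stop
        print(self.result.get())                # worker acknowledges with an 'end' record


if __name__ == '__main__':
    Master().run('https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB')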
Example #4
class SpiderMan(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HTMLDownloader()
        self.parser = HTMLParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # add the entry URL
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
            except Exception:
                print("crawl failed")
        self.output.output_html()
Example #5
class Spider(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HTMLDownloader()
        self.parser = HTMLParser()
        self.output = DataOutput()

    def crawl(self, root_url, crawl_size):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < crawl_size):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('%s links crawled' % self.manager.old_url_size())
            except Exception:
                print('crawl failed')
        self.output.output_html()
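Unlike the earlier schedulers, this variant takes the page limit as a parameter, so a run could be started as shown below; the seed URL is simply the one used in Example #9, and any entry page would work.

if __name__ == '__main__':
    spider = Spider()
    # crawl at most 100 pages starting from the seed article
    spider.crawl('https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB', 100)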
Example #6
class SpiderMan(object):
    def __init__(self):
        self.downloader = HTMLDownloader()
        self.parser = HTMLparser()
        self.manager = URLmanager()
        self.output = StoreDatas()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_urls()
               and self.manager.old_urls_length() <= 100):
            try:
                new_url = self.manager.get_new_url()
                html_text = self.downloader.download(new_url)
                newer_url, raw_data = self.parser.parser(new_url, html_text)
                data = self.output.store_datas(raw_data)
                self.manager.add_new_urls(newer_url)
            except Exception:
                print("crawl failed")
            finally:
                self.output.store_as_csv()
Example #7
 def __init__(self):
     self.manager = URLManage()
     self.parser = HTMLParse()
     self.downloader = HTMLDownloader()
     self.output = DataOutput()
Example #8
 def __init__(self):
     self.downloader = HTMLDownloader()
     self.parser = HTMLparser()
     self.manager = URLmanager()
     self.output = StoreDatas()
Example #9
# -*- coding: utf-8 -*-
from DataOutput import DataOutput
from HTMLDownloader import HTMLDownloader
from HTMLParse import HTMLParse
from URLManage import URLManage
from bs4 import BeautifulSoup
import re
s = HTMLDownloader()

t = s.download(
    'https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB')

soup = BeautifulSoup(t, 'lxml')

new_urls = set()
# extract URLs from the <a> tags whose href matches /item
# (an earlier variant in the source used recursive=True and href=re.compile(r'"/item/(.*?)"'))
links = soup.find_all('a', href=re.compile(r'/item'))
for link in links:
    print(link)
    with open("1.txt", 'a+') as f:
        f.write(str(link))
        f.write("\n")
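The snippet defines new_urls but never fills it; in the full parser the matched hrefs would typically be resolved against the page URL and collected into that set. The continuation below sketches that missing step; urljoin is from the standard library, while the exact collection logic of the project's HTMLParse is an assumption.

from urllib.parse import urljoin

page_url = 'https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB'
for link in links:
    href = link.get('href')
    if href:
        # resolve the relative /item/... path against the page URL
        new_urls.add(urljoin(page_url, href))
print('%d new URLs collected' % len(new_urls))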