Example #1
class Spider(object):
    def __init__(self):
        self.manage = UrlManager()

    def spider(self, url, param):
        page_num = HtmlParser.get_page_num(url)
        print('page_num:', page_num)
        with open('./name.csv', 'a') as csvfile:
            fieldnames = ['title', 'url', 'down']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
        for i in range(1, page_num + 1):
            page_url = url + param + str(i)
            print(page_url)
            new_urls = HtmlParser.get_page_urls(page_url)
            self.manage.add_new_urls(new_urls)
            while self.manage.has_new_url():
                try:
                    new_url = self.manage.get_new_url()
                    data = HtmlParser.get_data(new_url)
                    DataOutput.write_data(data)
                    print(data)
                except Exception as e:
                    print('Fetch failed! error:', e)
            print('Fetched {} records so far'.format(self.manage.old_urls_size()))
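Example #2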
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = FileDownLoader()
        self.parser = FileParser()
        self.output = DataOutput()

    def crawl(self, root_files):

        for root_file in root_files:
            new_urls = self.parser.parser(root_file)
            self.manager.add_new_urls(new_urls)

            while (self.manager.has_new_url()):
                try:
                    new_url = self.manager.get_new_url()
                    data = self.downloader.download(new_url)
                    self.output.store_data(data, root_file, new_url)
                    print("已经抓取%s个链接" % self.manager.old_url_size())

                    interval = random.randint(1, 3)

                    time.sleep(interval)
                    print("sleep: %d" % interval)

                except Exception as err:
                    self.output.mark_result(root_file, new_url, False)
                    print("crawl faild:" + str(err))
Example #3
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

        pass
Example #4
class Main(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()

    def process_solo(self, name_id, fold):
        # next_page: //tieba.baidu.com/f?kw=%E5%A5%B3%E4%BA%BA&ie=utf-8&pn=50
        solo_ba = 'https:'
        url = solo_ba + name_id

        response = self.downloader.download(url)
        p_lists, next_page, current_page = self.parser.parser_solo_ba(response)

        print('Page', current_page, 'posts,', 'next_page:', next_page, p_lists)
        for p in p_lists:
            self.process_p(p[0], p[1], fold)
        if next_page is not None:
            self.process_solo(next_page, fold)
        else:
            print('HAPPY!  program finish!!!!!')

    def process_p(self, page):
        # http://scxx.whfcj.gov.cn/xmqk.asp?page=1
        base_url = 'http://scxx.whfcj.gov.cn/xmqk.asp?page='
        url = base_url + str(page)

        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_p :', next_url)
            response = self.downloader.download(next_url)
            self.parser.parser_detail_p(response)
Example #5
class MainSearch(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()

    def process_page(self, name_id):
        solo_ba = 'http://weixin.sogou.com/weixin'
        url = solo_ba + name_id
        response = self.downloader.download(url)
        a = self.parser.parser_solo_ba(response)
        if a is None:
            return
        p_lists, next_page, current_page = a

        print('Page:', current_page, ', ListSize:', len(p_lists))
        if len(p_lists) > 0:
            for p in p_lists:
                self.process_article(p)
        if next_page is not None:
            self.process_page(next_page)

    def process_article(self, url):
        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_article :', next_url)
            response = self.downloader.download(next_url)
            self.parser.parser_detail_p(response)
Example #6
def main():
    idi = 1405150114
    urlmanager = UrlManager()
    pageur = urlmanager.url_login(idi)
    infourl = urlmanager.url_userinfo(idi)
    htmldownloader = HtmlDownloader()
    htmldownloader.download(
        'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx', idi, pageur,
        infourl)
Example #7
 def __init__(self):
     # Create a URL manager
     self.urlManager = UrlManager()
     # Create an HTML downloader
     self.downloader = htmlDownloader()
     # Create an HTML parser
     self.htmlparser = htmlParser()
     # Create an HTML data store
     self.htmlSave = htmlSave()
Example #8
    def __init__(self, sort, sort_url, sortFilename):
        threading.Thread.__init__(self)
        self.sort = sort
        self.sort_url = sort_url
        self.sortFilename = sortFilename

        self.manager = UrlManager(self.sort)
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
Example #9
    def __init__(self, bind_domain):

        # Create the object that manages crawl URLs and records the URLs already crawled
        self.urlManager = UrlManager(enable_external_link=False,
                                     bind_domain=bind_domain)

        # Create the object that requests the links
        self.downloader = HtmlDownloader()

        # Create the object that converts the HTML source into an lxml.html object and extracts new links
        self.parser = HtmlParser()
Example #10
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs, capped by how many have been fetched
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            # try:
            # Get a new URL from the URL manager
            new_url = self.manager.get_new_url()
            # Download the page with the HTML downloader
            html = self.downloader.download(new_url)
            # Extract page data with the HTML parser
            new_urls, data = self.parser.parser(new_url, html)
            # Add the extracted URLs to the URL manager
            self.manager.add_new_urls(new_urls)
            # Store the data with the data output component
            self.output.store_data(data)
            print("已经抓取%s个链接" % self.manager.old_url_size())
        # except Exception as e:
        # 	print(e)
        # 	print("Crawl failed")
        # Output the stored data in the specified format
        self.output.output_html()
Example #11
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while there are new URLs, capped by how many have been fetched
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print('1-------->new_url', new_url)
                # Download the page
                html = self.downloader.download(new_url)
                print('2-------->html')
                # Parse the page and extract data
                new_urls, data = self.parser.parser(new_url, html)
                print('3-------->new_urls, data', new_urls, data)
                # Add the extracted URLs to the manager
                self.manager.add_new_urls(new_urls)
                print('4-------->new_urls', new_urls)
                # Store the data
                self.output.store_data(data)
                print('Fetched %d links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed %s' % e)
        # Output the stored data in the specified format
        self.output.output_html()
Example #12
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld(
                ) < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except Exception:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
Example #13
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.download = HtmlDownload()
        self.parse = HtmlParse()
        self.output = DataOuput()

    def crawl(self, root_url):
        '''
        Add the entry URL and drive the crawl.
        :param root_url:
        :return:
        '''
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs, capped by how many have been fetched
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.download.download(new_url)
                # Parse the HTML and extract data
                new_urls, data = self.parse.parse(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                self.output.ouput_html()
                print('Fetched %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl fail', e)
Example #14
 def craw(self, root_url):
     # TODO: configurable crawl depth?
     page_count = self.page_count
     UrlManager.add_new_url(root_url)
     try:
         while UrlManager.has_new_url():
             todo_url = UrlManager.get_new_url()
             try:
                 print("\nNO: %d checking link: %s" % (page_count, todo_url))
             except:
                 print("Error", todo_url)
                 # TODO: would `continue` be better here?
             # page_status, bad_links = Downloader.test(todo_url)
             # print("\nResult: %s" % page_status)
             # TODO: add an explanation of the status codes
             state, content = Downloader.download(todo_url)
             if state:
                 parse_state, new_urls = Parser.parse(todo_url, content)
                 if parse_state:
                     UrlManager.add_new_urls(new_urls)
                 else:
                     Outputer.collect_data(new_urls)
             else:
                 Outputer.collect_data(content)
             page_count += 1
             # except:
             #     print("Page crawl failed")
             #     UrlManager.add_wrong_url(todo_url)
             # TODO: test code
             if page_count == 5000:
                 self.page_count = page_count
                 break
             print(UrlManager.num_new_url())
     finally:
         Outputer.output_txt(self.page_count)
Example #15
class Main(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()
        self.num = 1

    def process_solo(self, name, fold):
        # next_page: //tieba.baidu.com/f?kw=%E5%A5%B3%E4%BA%BA&ie=utf-8&pn=50
        ba = 'https:'
        url = ba + name

        response = self.downloader.download(url)
        p_lists, next_page, current_page = self.parser.parser_solo_ba(response)

        print('Page', current_page, 'posts,', 'next_page:', next_page, p_lists)
        for p in p_lists:
            self.process_p(p[0], p[1], fold)
        if next_page is not None:
            self.process_solo(next_page, fold)
        else:
            print('HAPPY!  program finish!!!!!')

    def process_p(self, title, short_url, ba_name):
        # http://tieba.baidu.com/p/5287680253
        base_url = 'http://tieba.baidu.com'
        if self.num != 1:
            url = base_url + short_url + "?pn=" + str(self.num)
        else:
            url = base_url + short_url

        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_p :', next_url)
            response = self.downloader.download(next_url)
            new_img_urls, big_img_urls, total_num = self.parser.parser_detail_p(
                response)

            # 'http://imgsrc.baidu.com/forum/pic/item/'
            self.downloader.load_imgs(big_img_urls, title, ba_name, True)

            self.num = self.num + 1

            if self.num > int(total_num):
                self.num = 1
                return
            else:
                self.process_p(title, short_url, ba_name)
Example #16
class Spider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, idi):
        rootloginurl = 'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx'
        pageurl = self.manager.url_login(idi)
        infourl = self.manager.url_userinfo(idi)
        htmlf, htmli = self.downloader.download(rootloginurl, idi, pageurl,
                                                infourl)
        xuehao, xingming, changpao, chenpao = self.parser.parser(
            infourl, pageurl, htmli, htmlf)
        print("Student ID: " + xuehao[0], "Name: " + xingming[0], changpao, chenpao)
Example #17
class SpiderMan(object):
    """爬虫调度器"""
    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Loop while the URL manager has new URLs, capped by how many have been fetched
        while (self.urlManager.has_new_url()
               and self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # Download the page with the HTML downloader
                html = self.htmlDownloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # Store the data with the data output component
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # Output the stored data in the specified format
        self.htmlOutput.output_html()
Example #18
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url, dir, logFile):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs
        while (
                # self.manager.has_new_url() and self.manager.old_url_size() < 2):
                self.manager.has_new_url()):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                print("Fetched %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
                print("crawl failed")
        # Output the stored data in the specified format
        self.output.output_html(dir, logFile)
Example #19
    def __init__(self):
        self.G_STATE_OK = 200
        self.crawMaxNum = -1
        self.crawCountNum = 0

        self.urlManager = UrlManager()
        self.dispatch = Dispatch()
        self.htmlParser = HtmlParser("http://baike.baidu.com")
        self.applicationShow = ApplicationShow()
Example #20
 def __init__(self, url_argv):
     sys.setrecursionlimit(10000000)
     """ 调度数据库接口, 引入初始化, 调度器, 爬取器, 分析器 """
     self.db = DbManager.DbManager(db_config).mysql_connect()
     self.config = spider_config.spider_config()
     self.initialization = Initialization.Initialization(
         self.db, self.config, url_argv)
     self.manager = UrlManager.UrlManager(self.db, self.config)
     self.craw = UrlCraw.UrlCraw(self.db, self.config)
     self.analyse = UrlAnalyse.UrlAnalyse(self.db, self.config)
     self.sprint = SpiderPrint.SpiderPrint()
     self.initialize_spider()
Example #21
class manage(object):
    def __init__(self):
        # Create a URL manager
        self.urlManager = UrlManager()
        # Create an HTML downloader
        self.downloader = htmlDownloader()
        # Create an HTML parser
        self.htmlparser = htmlParser()
        # Create an HTML data store
        self.htmlSave = htmlSave()

    def action(self):
        # Give the URL manager a root URL
        root_url = "https://baike.baidu.com/item/网络爬虫"
        self.urlManager.add_new_url(root_url)
        n = 0
        # Ask the URL manager whether there are URLs left to fetch
        while self.urlManager.has_new_url() and n <= 100:
            n += 1
            # Get a URL that has not been fetched yet
            new_url = self.urlManager.get_new_url()
            # Hand the URL to the downloader to fetch the HTML
            htmlStr = self.downloader.download(new_url)
            # Hand the HTML string to the parser, which returns a tuple:
            # the first element is a set of all URLs linked from the current page,
            # the second element is a dict with the current page's data
            urls, data = self.htmlparser.parser(new_url, htmlStr)
            # Hand the URLs to the URL manager
            self.urlManager.add_new_urls(urls)
            # Hand the data to the data store
            self.htmlSave.saveData(data)
            print("Data from page %s" % n)
        self.htmlSave.output()
Example #22
class Spider(object):
    def __init__(self):
        print('init')
        self.urlManager = UrlManager()
        self.downloader = Downloader()
        self.praser = HtmlPraser()
        self.outputer = Output()

    def craw(self, rootUrl):
        self.urlManager.addUrl(rootUrl)
        count = 1

        while self.urlManager.hasNewUrl():
            newUrl = self.urlManager.getNewUrl()
            print('Crawling URL #', count, ', url:', newUrl)
            htmlContent = self.downloader.download(newUrl)
            newUrls, newData = self.praser.praser(newUrl, htmlContent)
            self.urlManager.addUrls(newUrls)
            self.outputer.collect(newData)

            if count == 10:
                break

            count = count + 1

        self.outputer.output()
Example #23
class Spider(object):
    def __init__(self):
        self.manage = UrlManager()
        self.output = DataOutput()
        self.parse = HtmlParser()

    def crawl(self):
        print(self.parse.page_num)
        for i in range(1, self.parse.page_num + 1):
            new_urls = self.parse.get_page_urls(i)
            print(new_urls)
            self.manage.add_new_urls(new_urls)
            while self.manage.has_new_url():
                new_url = ''
                try:
                    new_url = self.manage.get_new_url()
                    print(new_url)
                    data = self.parse.get_data(new_url)
                    print(data)
                    self.output.save_mongo(data)
                    time.sleep(1)
                except Exception as e:
                    print('Fetch failed:', new_url, e)
        print('Fetched {} records so far'.format(self.output.data_size()))
Example #24
def start_server(path):
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    try:
        server.bind((HOST, PORT))
        server.listen(100)
        print('bind %s,ready to use' % PORT)
    except:
        print('Server is already running, quit')
        sys.exit()
    i = 0
    while True:
        connection, address = server.accept()
        username = address[0]
        connectionlist['connection' + str(i)] = connection
        if handshake(connection):
            print('handshake success')
            try:
                manager = UrlManager(connection, r'http://www.meitulu.com', '', path, True)
                manager.start()
            except:
                print('start new thread error')
                connection.close()
        i += 1
Example #25
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Fetched %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
        self.output.output_html()
Example #26
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()):
            # try:
            new_url = self.manager.get_new_url()
            html = self.downloader.download(new_url)
            new_urls, data = self.parser.parser(new_url, html)
            self.manager.add_new_urls(new_urls)
Example #27
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def get_url_page(self, url):
        pattern1 = re.compile(r'page=\d+')
        pattern2 = re.compile(r'\d+')
        rx = re.search(pattern1, url)
        rxx = re.search(pattern2, rx.group())
        url_page = int(rxx.group())
        return url_page

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        page_number = self.get_url_page(root_url)
        # Loop while the URL manager has new URLs, capped by how many have been fetched
        while (self.manager.has_new_urls()
               and self.manager.old_url_size() < 163):
            try:
                new_url = self.manager.get_new_urls()
                print(new_url)
                html = self.downloader.download(new_url)
                page_number += 1
                print('page=%s' % page_number)
                new_urls, data = self.parser.parser(new_url, html, page_number)
                #print(new_urls)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("已抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
                break
        print(self.output.datas)
        self.output.output_html()
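Example #28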
class MenetSpider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.urlmanager = UrlManager()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, i):
        try:
            print(f"Process {i} is running")
            url = self.urlmanager.get_new_url(i)
            html = self.downloader.download(url)
            data = self.parser.parser(html)
            return data
        except Exception:
            print(f"crawl failed at {i}")
            return pd.DataFrame([[0, 0, 0, 0, 0, 0, 0, 0, 0]],
                                columns=[
                                    '编码', "药品名称", "生产企业", "批文文号", "商品名", "剂型",
                                    "规格", "进口国产", "批准日期"
                                ])
Example #29
    def url_manager_process(self, task_queue):
        '''
        URL manager process.
        :param task_queue:   URL queue
        :param conn_queue:
        :param root_url:    start URL
        :return:
        '''
        sql = 'SELECT id,bname FROM ' + TABLE_NAME + ' WHERE bdoubanlink IS NULL OR bdoubanlink=""'
        url_manager = UrlManager()
        db = MysqlHelper(DATABASE_NAME)

        while True:
            if not url_manager.has_new_url():
                datas = db.select(sql)
                if datas:
                    for data in datas:
                        task_data = str(data[0])+'$$'+data[1].strip()
                        url_manager.add_new_url(task_data)
                    print('[√]  Data has been read from the database!')
                else:
                    print('[!]  Database query returned no rows.')
                    exit(-1)

            # Termination condition for the crawl
            if not url_manager.has_new_url():
                # Tell the worker nodes to stop
                task_queue.put('end')
                print('[·]  Controller sent the "end" command.')
                return

            while task_queue.qsize() < _config.QUEUE_NUM and url_manager.has_new_url():
                # Get a new URL from the URL manager
                new_url = url_manager.get_new_url()
                # Dispatch the URL to a worker
                task_queue.put(new_url)
                print('[+]  >>> %s' % new_url)
Example #30
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs, capped by how many have been fetched
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 10000):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                """
                with open(r"%s.html"%self.manager.old_url_size(), 'wb') as f:
                    f.write(html)
                    f.flush()

                """
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager

                self.manager.add_new_urls(new_urls)
                # Store the data with the data output component
                self.output.store_data(data)
                print("Fetched %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
                print("crawl failed")
            # Output the stored data in the specified format
        self.output.output_html()
Example #31
 def url_manager_proc(self, url_q):
     '''
     The URL manager process passes the pending cities in url_q to the crawler nodes.
     :param url_q: channel through which the manager process passes URLs to crawler nodes
     :return:
     '''
     url_manager = UrlManager()
     while True:
         while (url_manager.has_new_url()):
             # Get a new URL from the URL manager
             new_url = url_manager.get_new_url()
             # Send the new URL to a worker node
             url_q.put(new_url)
         # Tell the crawler nodes to stop
         url_q.put('end')
         # Shut down the manager node and persist the URL sets
         url_manager.save_progress('new_city.txt', url_manager.new_urls)
         url_manager.save_progress('old_city.txt', url_manager.old_urls)
         return
Example #32
class Main(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()

    def process_solo(self, name_id, fold):
        # next_page: //tieba.baidu.com/f?kw=%E5%A5%B3%E4%BA%BA&ie=utf-8&pn=50
        solo_ba = 'https:'
        url = solo_ba + name_id

        response = self.downloader.download(url)
        p_lists, next_page, current_page = self.parser.parser_solo_ba(response)

        print('Page', current_page, 'posts,', 'next_page:', next_page, p_lists)
        for p in p_lists:
            self.process_p(p[0], p[1], fold)
        if next_page is not None:
            self.process_solo(next_page, fold)
        else:
            print('HAPPY!  program finish!!!!!')

    def process_p(self, short_url):
        # https://zhuanlan.zhihu.com/p/26647066
        base_url = 'https://zhuanlan.zhihu.com'
        url = base_url + short_url

        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_p :', next_url)
            response = self.downloader.download(next_url)
            print(response)
            title, img_urls, links = self.parser.parser_detail_p(response)
            if title is not None and img_urls is not None:
                self.downloader.load_imgs(title, img_urls)
            if links is not None:
                self.manager.save_urls(links)