Exemple #1
0
 def __init__(self, url):
     self.root_url = url
     self.urlManager = UrlManager()
     self.dLoader = HtmlDLoader()
     self.contParser = HtmlParser()
     self.contOutputer = HtmlOutputer()
     pass
 def __init__(self):
     """构造函数,初始化属性"""
     self.urls = UrlManager()
     self.log = MyLog("spider_main", "logs")
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Exemple #3
0
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print "%d craw success : %s" % (count, new_url)
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print str(e)
                print "%d craw failed : %s" % (count, new_url)
        self.outputer.output()
Exemple #4
0
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # 添加第一个待爬取url
        self.urls.add_new_url(root_url)
        # 如果集合中有url, 就取出一个url 请求, 没有链接则跳出。
        while self.urls.has_new_url():
            try:
                # 开始爬取
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # 请求url, 返回html
                html_content = self.downloader.download(new_url)
                # xpath 解析html,得到需要的数据
                new_urls, new_data = self.parser.parse(html_content)
                # 一个词条页面上关联的a 链表列表加入到url 管理器中待爬取
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break

                time.sleep(2)
            except Exception as e:
                print(e)
                print(f'抓取失败:{new_url}')
        self.output.output_html()
Exemple #5
0
 def __init__(self):
     # URL 管理器
     # self.urls = UrlManager.UrlManager()
     self.urls = UrlManager()
     # URL 下载器
     # self.downloader = HtmlDownloader.HtmlDownloader()
     self.downloader = HtmlDownloader()
     # URL 解析器
     # self.parser = html_parser.HtmlParser()
     self.parser = HtmlParser()
     # self.outputer = html_outputer.HtmlOutputer()
     self.outputer = HtmlOutputer()
Exemple #6
0
class SpiderMain():
    def __init__(self):
        # URL 管理器
        # self.urls = UrlManager.UrlManager()
        self.urls = UrlManager()
        # URL 下载器
        # self.downloader = HtmlDownloader.HtmlDownloader()
        self.downloader = HtmlDownloader()
        # URL 解析器
        # self.parser = html_parser.HtmlParser()
        self.parser = HtmlParser()
        # self.outputer = html_outputer.HtmlOutputer()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        originSet = set()
        originSet.add(root_url)
        self.urls.add_new_urls(originSet)
        while self.urls.has_new_rul():
            try:
                new_url = self.urls.get_new_url()
                print "craw %d : %s" % (count, new_url)
                html_cont = self.downloader.downloader(new_url)

                # 输出信息
                downStat = "ERROR"
                if html_cont != None:
                    downStat = "SUCCESS"
                    print "[Page ID : %d downloader %s!]" % (count, downStat)

                new_urls, new_data = self.parser.parser(new_url, html_cont)
                # print "\nnew_urls[%s], new_data[%s]" % (new_urls, new_data)

                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print "craw failed! ERROR infomation : %s" % err
        self.outputer.output_html()
Exemple #7
0
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downlaoder = HtmlDownlaoder()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        # 把根url 传入url管理列表
        self.urls.add_url(root_url)

        # 页面爬取循环程序
        while self.urls.has_new_url():
            try:
                # 获取一个待爬取的url
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))

                # 下载该url爬取context
                html_cont = self.downlaoder.download(new_url)

                # 通过解析器,解析该url下载到的内容,获取新的 new_urls 和 新的 data
                new_urls, new_data = self.parser.parse(new_url, html_cont)

                # 把获取到 新的url添加到新的url管理器,
                self.urls.add_new_urls(new_urls)

                # 把获取的新的data添加到新的数据处理器中
                self.outputer.collect_data(new_data)

                if count == 100:
                    break

                count += 1
            except Exception as e:
                print('craw failed')

        self.outputer.output_html()
Exemple #8
0
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()  # 获取新url
                html_cont = self.downloader.download(new_url)  # 下载url内容
                new_urls, new_data = self.parser.parse(new_url,
                                                       html_cont)  # 解析url内容
                self.urls.add_new_urls(new_urls)  # 将解析到的新url存入url管理器
                self.outputer.collect_data(new_data)  # 收集解析到的数据
                if count == 200:
                    break
                count = count + 1
            except:
                print("craw failed")
        self.outputer.output_html()
Exemple #9
0
class SpiderMain(object):
    """docstring for SpiderMain"""
    def __init__(self):
        self.urlManage = UrlManage()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
    def craw(self,url):
        self.urlManage.add_new_url(url)
        
        count = 1
        while self.urlManage.has_new_url():
            url = self.urlManage.get_new_url()
            print '%dth page,address:%s' % (count,url)
            html_content = self.downloader.downloadPage(url)
            new_urls,new_data = self.parser.parse(html_content,url)
            self.urlManage.add_new_urls(new_urls)
            self.outputer.collect_data(new_data)

            if count == 10:
                break

            count = count + 1 
        self.outputer.output_html()
class SpiderMain():
    """爬虫程序主模块"""
    def __init__(self):
        """构造函数,初始化属性"""
        self.urls = UrlManager()
        self.log = MyLog("spider_main", "logs")
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
        #self.util=utill.DBConn()

    def craw(self, root_url):
        """爬虫入口函数"""
        areas = {
            "gulou": 100,
            "jianye": 72,
            "qinhuai": 100,
            "xuanwu": 67,
            "yuhuatai": 32,
            "qixia": 62,
            "baijiahu": 33,
            "chalukou1": 26,
            "jiangningqita11": 3,
            "dongshanzhen": 29,
            "jiangningdaxuecheng": 15,
            "jiulonghu": 12,
            "jiangjundadao11": 22,
            "kexueyuan": 9,
            "qilinzhen": 42,
            "tiexinqiao": 9,
            "pukou": 100,
            "liuhe": 1,
        }

        #areas = {"gulou":1}

        #1、抓取所有二手房详情界面链接,并将所有连接放入URL管理模块
        for area, pg_sum in areas.items():
            for num in range(1, pg_sum + 1):
                #1.1 拼接页面地址: https://nj.lianjia.com/ershoufang/gulou/pg2/
                pg_url = root_url + area + "/pg" + str(num) + "/"
                self.log.logger.info("1.1 拼接页面地址:" + pg_url)
                print("1.1 拼接页面地址:" + pg_url)
                #1.2 启动下载器,下载页面.
                try:
                    html_cont = self.downloader.download(pg_url)
                except Exception as e:
                    self.log.logger.error("1.2 下载页面出现异常:" + repr(e))
                    time.sleep(60 * 30)
                else:
                    #1.3 解析PG页面,获得二手房详情页面的链接,并将所有链接放入URL管理模块
                    try:
                        ershoufang_urls = self.parser.get_erhoufang_urls(
                            html_cont)
                    except Exception as e:
                        self.log.logger.error("1.3 页面解析出现异常:" + repr(e))
                    else:
                        self.urls.add_new_urls(ershoufang_urls)
                        #暂停0~3秒的整数秒,时间区间:[0,3]
                        time.sleep(random.randint(0, 3))

        time.sleep(60 * 20)
        #2、解析二手房具体细心页面
        id = 1
        stop = 1
        while self.urls.has_new_url():
            #2.1 获取url
            try:
                detail_url = self.urls.get_new_url()
                self.log.logger.info("2.1 二手房页面地址:" + detail_url)
                print("2.1 二手房页面地址:" + detail_url)
            except Exception as e:
                print("2.1 拼接地址出现异常")
                self.log.logger.error("2.1 拼接地址出现异常:" + detail_url)

            #2.2 下载页面
            try:
                detail_html = self.downloader.download(detail_url)
            except Exception as e:
                self.log.logger.error("2.2 下载页面出现异常:" + repr(e))
                self.urls.add_new_url(detail_url)
                time.sleep(60 * 30)
            else:
                #2.3 解析页面
                try:
                    ershoufang_data = self.parser.get_ershoufang_data(
                        detail_html, id)
                except Exception as e:
                    self.log.logger.error("2.3 解析页面出现异常:" + repr(e))
                else:
                    #2.4 输出数据
                    try:
                        self.outputer.collect_data(ershoufang_data)
                    except Exception as e:
                        self.log.logger.error("2.4 输出数据出现异常:" + repr(e))
                    else:
                        print(id)
                        id = id + 1
                        stop = stop + 1
                        #暂停0~3秒的整数秒,时间区间:[0,3]
                        time.sleep(random.randint(0, 3))
                        if stop == 2500:
                            stop = 1
                            time.sleep(60 * 20)
Exemple #11
0
 def __init__(self):
     self.urls = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Exemple #12
0
class GrabMain(object):
    def __init__(self, url):
        self.root_url = url
        self.urlManager = UrlManager()
        self.dLoader = HtmlDLoader()
        self.contParser = HtmlParser()
        self.contOutputer = HtmlOutputer()
        pass

    def grabText(self):
        if self.root_url is None:
            return
        self.urlManager.add_new_next_url(self.root_url)
        self.contParser.parser_set(None, None, None, None, None)
        while self.urlManager.get_new_next_count():
            try:
                new_url = self.urlManager.get_new_next_url()
                html_cont = self.dLoader.download(new_url)
                urls, nexts = self.contParser.parser_text_urls(html_cont)
                self.urlManager.add_new_next_urls(nexts)
                self.urlManager.add_new_urls(urls)
            except:
                print "url is error."

        pool = threadpool.ThreadPool(10)
        requests = threadpool.makeRequests(self.thread_grabText,
                                           self.urlManager.new_urls)
        [pool.putRequest(req) for req in requests]
        pool.wait()

    def thread_grabText(self, url):
        try:
            print "curr url is %s." % url
            html_cont = self.dLoader.download(url)
            title, cont = self.contParser.parser_text_cont(html_cont)
            self.contOutputer.output_cont(title, cont)
        except:
            print "url is %s, error." % url

    def grabImgs(self):
        if self.root_url is None:
            return None
        self.urlManager.add_new_next_url(self.root_url)
        self.contParser.parser_set(None, None, None, None, None)
        while self.urlManager.get_new_next_count():
            try:
                new_url = self.urlManager.get_new_next_url()
                html_cont = self.dLoader.download(new_url)
                urls, nexts = self.contParser.parser_text_urls(html_cont)
                self.urlManager.add_new_next_urls(nexts)
                self.urlManager.add_new_urls(urls)
            except:
                print "url is error."

        pool = threadpool.ThreadPool(10)
        requests = threadpool.makeRequests(self.thread_grabImg,
                                           self.urlManager.new_urls)
        [pool.putRequest(req) for req in requests]
        pool.wait()

    def thread_grabImg(self, url):
        try:
            print "curr url is %s." % url
            html_cont = self.dLoader.download(url)
            title, links = self.contParser.parser_img_cont(html_cont)
            if links is None or len(links) == 0:
                print "url is %s, not src." % url
                return None

            if title is None:
                title = time.time()
            try:
                if not os.path.isdir(title):
                    os.mkdir(title)
            except:
                title = time.time()
                if not os.path.isdir(title):
                    os.mkdir(title)

            params = []
            index = 0
            for link in links:
                params.append(([title, link, index], None))
                index += 1

            pool = threadpool.ThreadPool(12)
            requests = threadpool.makeRequests(self.contOutputer.output_img,
                                               params)
            [pool.putRequest(req) for req in requests]
            pool.wait()
        except:
            print "url is %s, error." % url
Exemple #13
0
 def __init__(self):
     self.urlManage = UrlManage()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()