Example 1
class SpiderMain(object):
    def __init__(self):
        # Initialize the crawler components
        self.download = Downloader()
        self.parser = HtmlParser()
        self.mysql = Mysqldb()

    def run(self, url, database):
        response = self.download.download(url)
        self.parser.parser(response, database)
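
Example 1 wires together a Downloader, an HtmlParser and a Mysqldb store without showing their code. As a point of reference, the sketch below is a minimal Downloader with the download(url) interface used here, built on requests; the headers and timeout are illustrative assumptions, not the original project's values.

import requests


class Downloader(object):
    def __init__(self, timeout=10):
        # Illustrative defaults; the original class is not part of this listing
        self.timeout = timeout
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def download(self, url):
        # Return the page body on success, None on any failure
        try:
            resp = requests.get(url, headers=self.headers, timeout=self.timeout)
            resp.raise_for_status()
            resp.encoding = resp.apparent_encoding
            return resp.text
        except requests.RequestException:
            return None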
Example 2
class SpiderMain(object):
    def __init__(self):
        # Initialize the crawler components
        self.download = Downloader()
        self.parser = HtmlParser()
        self.save = SaveData()
        self.workbook = Workbook()
        self.ch = Choice()
        print('Initialization complete...')

    def run(self):
        while True:
            try:
                p = int(input('How many pages do you want to crawl?' + '\n'))
                break
            except ValueError:
                print('Invalid input! Please enter a number')
        page = p + 1
        print("================================")
        print(' A. Original Releases     B. Quality Software      ')
        print(' C. Unpacking & Cracking  D. Mobile Security       ')
        print(' E. Virus Analysis        F. Programming Languages ')
        print(' G. Software Debugging    H. Animation Releases    ')
        print(' I. Reversing Resources   J. Security Tools        ')
        print(' K. Jobs & Recruitment                             ')
        print("================================")
        while True:
            choice = input("Choose a section to crawl, or enter Q to quit (the letter must be uppercase): ")
            half_url, name = self.ch.make_the_arrg(choice)
            if name != 'Error':
                break
        print(half_url + '\n' + name)
        self.save.createfile(name)
        for i in range(1, page):
            url = half_url + str(i) + '.html'
            response = self.download.download(url)
            self.parser.parser(response, name)
            sleep = random.randint(2, 10)
            print('Finished crawling page ' + str(i) + ', sleeping for ' + str(sleep) + ' seconds')
            time.sleep(sleep)  # pause between pages
            if i != page - 1:
                print('-----------------------------')
                print('          Next page          ')
                print('-----------------------------')
        print('Data written, removing duplicates...')
        self.save.delete_same_data()
        try:
            self.workbook.save('将csv的数据导入此表.xlsx')
        except Exception:
            print('Failed to create the xlsx file, please create it manually')
        print('Program finished')
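
Example 2 relies on a Choice helper whose make_the_arrg(choice) returns a (half_url, name) pair and signals invalid input with name == 'Error'; page URLs are then built as half_url + page number + '.html'. The sketch below shows one way such a mapping could look; the URL prefixes, the section names and the Q-to-quit behaviour are assumptions for illustration, not the original implementation.

class Choice(object):
    # Placeholder prefixes; the real forum URLs are not part of this listing
    SECTIONS = {
        'A': ('https://example.com/forum-1-', 'Original Releases'),
        'B': ('https://example.com/forum-2-', 'Quality Software'),
        # ... the remaining sections follow the same (prefix, name) pattern
    }

    def make_the_arrg(self, choice):
        # Assumed behaviour: Q exits, unknown letters report 'Error'
        if choice == 'Q':
            raise SystemExit('Quit')
        return self.SECTIONS.get(choice, ('', 'Error'))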
Example 3
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        程序主逻辑
        :param root_url: 入口 url
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.downloader(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("爬取了{}条链接".format(self.manager.old_url_size()))
            except Exception as e:
                print("爬取失败", e)
Example 4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_urls()
               and self.manager.old_url_size() < 100):
            # try:
            # Get a new url from the manager
            new_url = self.manager.get_new_url()
            # Download the page
            html = self.downloader.download(new_url)
            # Parse the page and extract new urls and data
            new_urls, data = self.parser.parser(new_url, html)
            # Feed the new urls back into the URL manager
            self.manager.add_new_urls(new_urls)
            # Store the extracted data
            self.output.store_data(data)
            print("Crawled %s links so far" % self.manager.old_url_size())
            # except Exception as e:
            #     print("crawl failed", e)
        self.output.out_put_html()
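
Example 4 buffers results with store_data() during the crawl and writes everything out once with out_put_html() at the end. The class below is a hedged sketch of that contract, assuming each data item is a dict of field values; the real DataOutput is not shown in the listing.

class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        # Buffer rows in memory during the crawl
        if data is not None:
            self.datas.append(data)

    def out_put_html(self, path='output.html'):
        # Flush the buffered rows as a simple HTML table
        with open(path, 'w', encoding='utf-8') as f:
            f.write('<html><body><table>\n')
            for data in self.datas:
                f.write('<tr>')
                for value in data.values():
                    f.write('<td>%s</td>' % value)
                f.write('</tr>\n')
            f.write('</table></body></html>\n')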
Example 5
class SpiderMan(object):
    def __init__(self):
        self.manger = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = OutputData()

    def crawl(self, root_url):
        """
        主程序
        :param root_url: 入口 URL
        :return:
        """
        self.manger.add_new_url(root_url)
        while self.manger.has_new_url() and self.manger.old_urls_size() < 5:
            new_url = self.manger.get_new_url()
            html = self.downloader.downloader(new_url)
            next_url, data = self.parser.parser(new_url, html)
            self.manger.add_new_url(next_url)
            self.output.outputTxt(data)
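
Examples 3 to 5 are all started from a small entry point that builds the spider and hands it a root URL. A typical launcher looks like the following sketch; the entry URL is a placeholder, not one used by these projects.

if __name__ == '__main__':
    spider = SpiderMan()
    # Placeholder entry URL; substitute the site the crawler targets
    spider.crawl('https://example.com/start-page')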
Example 6
class SpiderMan(object):
    def __init__(self):
        # Number of running worker threads
        self.pcount = 1
        # Queue for parsed results
        self.dqueue = queue.Queue()
        # Queue for error messages
        self.equeue = queue.Queue()
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
        # self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.inactivepro = []
        self.count = 0
        self.sumSuccess = 0
        self.sumFail = 0
        self.updating = False
        #self.proxies = ['http://127.0.0.1:2740']

    def doCrawl(self, new_url):
        try:
            self.pcount += 1
            count = 1
            # Pick a proxy IP at random
            pro = random.choice(self.proxies)
            #pro = 'http://127.0.0.1:2740'
            while (True):
                # Download the page through the proxy
                html = self.downloader.download(new_url, pro)
                # Parse the page and extract data
                data = self.parser.parser(new_url, html)
                # Storing data here caused multi-threaded write conflicts, so it was dropped
                # self.output.store_data(data)
                # If the page hit the anti-bot (robot) check
                if data == "robot":
                    if count < 6:
                        count = count + 1
                        # Proxy elimination scheme:
                        #   in self.proxies only       -> performing well
                        #   in both lists              -> blocked once, under observation
                        #   in self.inactivepro only   -> blocked twice, temporarily retired
                        #   in neither                 -> failed to revive, permanently removed
                        if (count == 5 and len(self.proxies) > 100):
                            if pro not in self.inactivepro:  # put it on the watch list
                                self.inactivepro.append(pro)
                                pro = random.choice(self.proxies)
                            else:  # retire it for now
                                print(str(pro) + " out\n")
                                if pro in self.proxies:
                                    self.proxies.remove(pro)
                        continue
                    else:
                        raise Exception("robot check")
                else:
                    break
            # Queue the parsed data for writing later
            self.dqueue.put(data)
        except Exception as e:
            self.sumFail = self.sumFail + 1
            print(
                "Fail: link %d fail %d times : %s\n" %
                (self.count, self.sumFail, new_url), e.args)
            # Try to revive a retired proxy
            if (self.inactivepro
                    and (len(self.proxies) < 200 or len(self.inactivepro) > 500)):
                pro = random.choice(self.inactivepro)
                if (pro is not None and pro not in self.proxies
                        and self.testIP(pro)):
                    self.proxies.append(pro)
                    print(str(pro) + " in!!!\n")
                # Remove pro from the inactive list regardless of the outcome;
                # the membership check guards against concurrent modification
                if pro in self.inactivepro:
                    self.inactivepro.remove(pro)
            self.equeue.put([new_url, e.args])
        else:
            self.sumSuccess = self.sumSuccess + 1
            print("Success: link %d success %d times : %s" %
                  (self.count, self.sumSuccess, new_url))
        finally:
            self.pcount -= 1

    def setProxy(self):
        #self.proxies = getProxy()
        self.proxies = getFromPool2()
        self.updating = False

    # Flush queued results and error messages
    def outPutData(self):
        while (not self.dqueue.empty()):
            data = self.dqueue.get()
            self.output.store_data(data)
        while (not self.equeue.empty()):
            err = self.equeue.get()
            self.output.store_err(err)

    def testIP(self, pro):
        # Probe the proxy; requests expects the proxies mapping to be keyed
        # by scheme ('http'/'https'), not by the literal string 'proxy'
        url = 'https://www.douban.com'
        try:
            res = requests.get(url, proxies={'http': pro, 'https': pro},
                               timeout=20)
        except requests.RequestException:
            return False
        return res.status_code == 200

    def crawl(self):
        threads = []
        preFail = 0
        # Skip the urls crawled in a previous run
        for i in range(22350):
            self.manager.has_new_url()
        while (self.manager.has_new_url()):
            try:
                self.count = self.count + 1
                # Kick off a background refresh of the proxy list
                if self.sumFail - preFail > 46 and not self.updating:
                    self.updating = True
                    print("\n\nstart refreshing proxies\n\n")
                    t = threading.Thread(target=self.setProxy)
                    t.start()
                    threads.append(t)
                    # p = Pool()
                    # result = p.apply_async(getFromPool2, args=())
                    # p.close()
                    #self.proxies = result.get()
                # Every 50 links, flush the output buffers and report the success rate
                if (self.count % 50 == 0 and self.count != 0):
                    preFail = self.sumFail
                    rate = float(self.sumSuccess) / float(self.count - 1)
                    print("Success Rate: %f" % rate)
                    self.output.store_err([str(self.count), str(rate)])
                    self.output.flush()
                # Get a new url from the URL manager
                new_url = self.manager.get_new_url()
                # Main crawl step, dispatched to a worker thread
                if self.pcount < 0:
                    pcount = 0
                else:
                    pcount = self.pcount
                time.sleep(random.random() + pcount / 10)  # random delay, scaled by the number of active threads
                t = threading.Thread(target=self.doCrawl, args=(new_url,))
                t.start()
                threads.append(t)
                # Flush queued results and error messages
                self.outPutData()
            except Exception as e:
                print("wired fail")
        for t in threads:
            t.join()
        self.outPutData()
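
The crawl() loop in Example 6 starts one thread per URL and throttles with a sleep scaled by the live thread count. A bounded worker pool achieves the same producer/consumer split with a fixed amount of concurrency; the sketch below is such an alternative built on concurrent.futures, not the original design, and it assumes a SpiderMan instance like the one above.

from concurrent.futures import ThreadPoolExecutor


def crawl_with_pool(spider, max_workers=8):
    # Fixed-size pool instead of one thread per URL
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = []
        while spider.manager.has_new_url():
            new_url = spider.manager.get_new_url()
            futures.append(pool.submit(spider.doCrawl, new_url))
        for f in futures:
            f.result()      # wait for every worker to finish
    # Drain the data and error queues once all workers are done
    spider.outPutData()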