Example 1
class SpiderManager(object):
    def __init__(self):
        self.htmldownload = HtmlDownload()
        self.parse = HtmlParse()
        self.urlmanage = UrlManager()
        self.storedata = StoreData()

        self.province = []  # accumulates the parsed province records
Example 2
class SpiderManager(object):
    def __init__(self):
        self.htmldownload = HtmlDownload()
        self.parse = HtmlParse()
        self.urlmanage = UrlManager()
        self.storedata = StoreData()

        self.province = []

    def crawl(self, url):
        # Crawl the entry page first, then every link found on it
        html = self.htmldownload.Download(url)
        urls = self.parse.get_all_url(html)
        self.urlmanage.add_urls(urls)
        i = 0
        while self.urlmanage.has_url():
            url = self.urlmanage.get_url()
            html = self.htmldownload.Download(url)
            data = self.parse.get_province(html)
            self.province.append(data)
            i += 1
            print(i)  # progress counter
        # print(self.province)
        # file name: "CAS academician information.json"
        self.storedata.store(self.province, "中国科学院院士信息.json")
        result = self.storedata.analysis(self.province)
        # file name: "province statistics result.json"
        self.storedata.store(result, "省份统计结果.json")
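Examples 1 and 2 never show UrlManager itself, but the calls to add_urls, has_url, and get_url imply a deduplicating queue. A minimal sketch; only the method names come from the example, the two-set internals are an assumption:

class UrlManager(object):
    """Sketch of the manager Example 2 calls (internals assumed)."""
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already handed out

    def add_urls(self, urls):
        # Queue only URLs we have never seen before
        for url in urls:
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)

    def has_url(self):
        return len(self.new_urls) > 0

    def get_url(self):
        # Pop an arbitrary pending URL and mark it as crawled
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url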
Example 3
class SpiderMan():
    def __init__(self):
        """
        Initialize each module
        """
        self.manager = UrlManager()
        self.download = HtmlDownload()
        self.parse = ParseData()
        self.ouput = DataOutput()
Example 4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        程序主逻辑
        :param root_url: 入口 url
        :return:
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.downloader(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("爬取了{}条链接".format(self.manager.old_url_size()))
            except Exception as e:
                print("爬取失败", e)
Example 5
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_urls()
               and self.manager.old_url_size() < 100):
            # try:
            # Fetch a new URL from the manager
            new_url = self.manager.get_new_url()
            # Download the page
            html = self.downloader.download(new_url)
            # Parse the page for data and follow-up links
            new_urls, data = self.parser.parser(new_url, html)
            # Add the new links to the URL manager
            self.manager.add_new_urls(new_urls)
            # Buffer the data for output
            self.output.store_data(data)
            print("Crawled %s links" % self.manager.old_url_size())
            # except Exception as e:
            #     print("crawl failed", e)
        self.output.out_put_html()
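Example 5 buffers records with store_data and flushes them once at the end with out_put_html. A sketch of such an output class; the file name and the shape of each record (an iterable of fields) are assumptions:

class DataOutput(object):
    def __init__(self):
        self.datas = []  # buffered records

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def out_put_html(self):
        # Flush the buffer as one HTML table
        with open('output.html', 'w', encoding='utf-8') as f:
            f.write('<html><body><table>\n')
            for record in self.datas:
                cells = ''.join('<td>%s</td>' % field for field in record)
                f.write('<tr>%s</tr>\n' % cells)
            f.write('</table></body></html>\n')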
Example 6
import time

def run():
    # Data crawler
    factory_spider = Spiser()
    # Data parser
    data_parser = dataParser.DataParser()
    data_manager = DataManager()
    url_manager = UrlManager()
    # Get three URL lists, all of which need to be crawled
    url_list, shopurl1_list, shopurl2_list = url_manager.tel_url()
    total_num = len(url_list)
    crawred_url = url_manager.crawred_url()
    company_dataList = []

    for i in range(total_num):
        url = url_list[i]
        shopurl1 = shopurl1_list[i]
        shopurl2 = shopurl2_list[i]

        if url not in crawred_url:
            page_data = factory_spider.get_urlpage(url)
            page_shop1 = factory_spider.get_urlpage(shopurl1)
            page_shop2 = factory_spider.get_urlpage(shopurl2)
            # Parse the three pages with the parser
            companydata = data_parser.get_company_data(page_data, page_shop1,
                                                       page_shop2, url)
            # Append the parsed data tuple to the list
            company_dataList.append(companydata)
            time.sleep(1.1)
        # elif url in crawred_url:
        #     print('Already crawled:', url)

        # Checkpoint the results to a local CSV file every 10 iterations
        print('=========', i, '==========')
        if i % 10 == 0 and len(company_dataList) > 0:
            data_manager.save_local_tel(company_dataList)
            company_dataList = []
            time.sleep(10)
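Example 6's checkpointing only works if save_local_tel appends rather than overwrites, so earlier batches survive the buffer reset. A sketch of that writer using the csv module; the file name and append mode are assumptions:

import csv

class DataManager(object):
    def save_local_tel(self, rows):
        # Append each buffered company tuple as one CSV row;
        # mode 'a' lets successive checkpoints accumulate
        with open('companies.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(rows)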
Example 7
class SpiderMan():
    """
    爬虫主模块
    """
    def __init__(self):
        """
        初始化各个模块
        """
        self.manager = UrlManager()
        self.download = HtmlDownload()
        self.parse = ParseData()
        self.ouput = DataOutput()

    def start(self, city, job):
        """
        开始爬取
        :return:
        """
        # 创建 csv 文件
        self.ouput.create_csv(city, job)
        # 爬 2页
        position_url = self.manager.get_position_url()
        for pn in range(1, 2):
            """
            买的是动态代理,使用时发现,代理虽然换了,但也常被封,可能是这个代理别人
            也拿去爬拉勾了,所以只要失败就重新换代理再请求
            """
            while True:
                response = self.download.get_html(pn, position_url, city, job)
                if 'true' not in response.text:
                    continue
                else:
                    break

            data = self.parse.get_info(response.text)

            if data is None:  # skip pages with encoding errors
                continue

            if data == []:
                print('\nCrawl finished, or Lagou has no matching positions in this city!')
                break
            self.ouput.write_to_csv(data, city, job)

            print('\rPage {} crawled'.format(str(pn)), end='')

        print('\nCrawl finished; generating the job information report...')
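The while True loop in Example 7 retries forever if every proxy in the pool is banned. A bounded variant, as a sketch; get_with_retry, max_tries, and the sleep interval are not in the original:

import time

def get_with_retry(download, pn, position_url, city, job, max_tries=5):
    # Retry with a fresh proxy until the response looks valid,
    # using the same 'true' marker test as Example 7
    for _ in range(max_tries):
        response = download.get_html(pn, position_url, city, job)
        if 'true' in response.text:
            return response
        time.sleep(2)  # brief pause before the next attempt
    return None  # caller decides what to do after max_tries failures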
Example 8
class SpiderMan(object):
    def __init__(self):
        self.manger = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = OutputData()

    def crawl(self, root_url):
        """
        主程序
        :param root_url: 入口 URL
        :return:
        """
        self.manger.add_new_url(root_url)
        while self.manger.has_new_url() and self.manger.old_urls_size() < 5:
            new_url = self.manger.get_new_url()
            html = self.downloader.downloader(new_url)
            next_url, data = self.parser.parser(new_url, html)
            self.manger.add_new_url(next_url)
            self.output.outputTxt(data)
Example 9
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()
Example 10
from flask import url_for

def login():
    url = url_for('hello')  # resolve the route from the view name: /api/hello/
    url1 = UrlManager.buildUrl('/api')  # /api
    url2 = UrlManager.buildStaticUrl(
        '/css/bootstrap')  # /css/bootstrap?version=20180925
    return 'Hello World,url:%s,url1:%s, url2:%s' % (url, url1, url2)
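Example 10's UrlManager wraps URL building so every static asset carries a cache-busting version query string. A sketch that matches the commented outputs; reading the version from Flask config (and the RELEASE_VERSION key) is an assumption:

from flask import current_app

class UrlManager(object):
    @staticmethod
    def buildUrl(path):
        # Hook for prefixing an app root or domain; here a pass-through
        return path

    @staticmethod
    def buildStaticUrl(path):
        # Append a release version so browsers refetch updated assets,
        # e.g. /css/bootstrap?version=20180925
        version = current_app.config.get('RELEASE_VERSION', '20180925')
        return UrlManager.buildUrl(path) + '?version=' + version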