Example #1
    def get_lunbo(self):
        '''
        Finance section
        :return:
        '''
        url = 'http://www.news.cn/fortune/'
        html = requests.get(url, headers=self.get_caijing_header())
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="swiper-slide"]/a/@href')

        year = arrow.now().date().year

        news_list = []

        for url in urls:
            if str(year) in url:
                log('Carousel URL to visit', url)
                find_one = self.mgr.find_one('url', url)
                if find_one is not None:
                    log_line('URL already exists, skipping request')
                    log(url)
                    continue
                news = self.get_iteminfo(url)
                if news == 'timeout' or news == 'error':
                    continue
                news_list.append(news)
        return news_list
Example #2
    def parser_data(self, url):
        '''
        Request each news detail page
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1
            return 'timeout', 'timeout'

        if html.status_code != 200:
            return 'error', 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//div[@class="ldContent"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()

        date = response.xpath('//div[@class="ldDate"]/text()')[0]
        date = date.split(':')[1]
        # log('content', content)
        return date, content
Example #3
    def get_content(self, url):
        '''
        Request each news detail page
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log('Error visiting URL!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//div[@class="union"]/descendant-or-self::*/text()')
        return ''.join(con_list).strip()
Example #4
    def get_newsinfo(self, urls):
        '''
        Visit each news detail page
        :param urls: collection of news links
        :return: news model
        '''
        for url in urls:
            t_sleep()
            log('Currently visiting URL', url)

            try:
                html = requests.get(url, timeout=3)
                html.encoding = 'utf-8'
            except Exception as e:
                log_line('Request failed')
                print(e)
                self.__class__.retry = 1

                continue

            if html.status_code != 200:
                continue

            response = etree.HTML(html.text)

            item = self.parse_item(response, html.url)
            MogoMgr().insert(item)
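A note on MogoMgr: the helper used here for deduplication and storage is not shown anywhere in this listing. The sketch below is an assumption of what it might look like, using pymongo with hypothetical database/collection names ('spider' / 'news'); it is not the original implementation.

    # Hypothetical sketch of the MogoMgr helper (assumed, not the original code)
    from pymongo import MongoClient

    class MogoMgr:
        def __init__(self, host='localhost', port=27017):
            # Assumed database and collection names
            self.coll = MongoClient(host, port)['spider']['news']

        def find_one(self, key, value):
            # Mirrors how the spiders call find_one('url', url) to skip already-crawled links
            return self.coll.find_one({key: value})

        def insert(self, news):
            # Assumes the News model keeps its fields in __dict__
            self.coll.insert_one(dict(news.__dict__))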
Example #5
    def get_content(self, url):
        '''
        Request each news detail page
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            return 'error'

        response = etree.HTML(html.text)

        return self.parse_item(response)
Example #6
    def get_iteminfo(self, url):
        '''
        Visit one news detail page
        :param url: news link
        :return: news model
        '''
        t_sleep()

        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            return 'timeout'

        if html.status_code != 200:
            log('Error visiting URL!!!', url)
            return 'error'

        response = etree.HTML(html.text)
        title, date, content = self.parse_item(response)

        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'xinhua'
        return news
Example #7
    def get_newsinfo(self, url):
        '''
        Request each news detail page
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log_line('Response status is not 200')
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'bjjrj'
        return news
Example #8
    def re_send(cls):

        if cls.retry != -1 and cls.retry_flag == -1:
            log_line('Some news requests failed, visiting them again')
            log('Spider class being rerun', cls)
            cls.retry_flag = 1
            cls().run()
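re_send relies on two class attributes, retry and retry_flag, which are set elsewhere (the request methods above set retry = 1 when a URL times out or errors). A minimal sketch of the assumed defaults and flow, not the original base class:

    # Assumed base-class layout for the retry flags used by re_send()
    class BaseSpider:
        retry = -1        # flipped to 1 by a request method when a URL times out or errors
        retry_flag = -1   # flipped to 1 once re_send() has already rerun the spider

        @classmethod
        def re_send(cls):
            if cls.retry != -1 and cls.retry_flag == -1:
                cls.retry_flag = 1
                cls().run()   # rerun the subclass once (subclasses define run()); stored URLs are skipped via find_one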
Example #9
    def get_newsinfo(self, url):
        '''
        Request each news detail page
        :param url:
        :return:
        '''
        t_sleep()

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('Request failed')
            print(e)
            return 'timeout'

        response = etree.HTML(html.text)
        log('Currently visiting URL', url)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        return news
Example #10
    def get_itemlist(self, page='1'):
        '''
        Fetch all Xinhua Finance news details
        :return: news models
        '''

        # Xinhua Finance - news list
        url = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={0}&cnt=16&tp=1&orderby=1'.format(page)

        html = requests.get(url, headers=self.get_newlist_header())
        items = json.loads(html.text[1:-1])
        items = items['data']['list']

        news_list = []

        for item in items:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', item['LinkUrl'])
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(item['LinkUrl'])
                continue

            news = self.get_iteminfo(item['LinkUrl'])
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list
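The html.text[1:-1] slice above suggests the list endpoint wraps its JSON payload in one leading and one trailing character (a JSONP-style response). A small illustration of that assumption, with a placeholder body:

    import json

    # Assumed response shape: the JSON object wrapped in parentheses (placeholder data)
    raw = '({"data": {"list": [{"LinkUrl": "http://www.news.cn/fortune/example.html"}]}})'
    items = json.loads(raw[1:-1])   # strip the leading '(' and trailing ')'
    print(items['data']['list'][0]['LinkUrl'])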
Example #11
    def run(self):
        log_line('ZqrbSpider started!!!')

        url = self.get_start_url()
        urls = self.get_html(url)
        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #12
    def run(self):
        log_line('StcnSpider started!!!')

        url = 'http://www.stcn.com/'

        urls = self.get_html(url)
        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #13
    def run(self):
        log_line('ShangHaiSpider started!!!')

        url = 'http://www.shanghai.gov.cn/nw2/nw2314/nw2319/nw41893/index.html'
        urls = self.get_html(url)

        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #14
    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue
            news = self.get_newsinfo(url)
            news_list.append(news)
        return news_list
Example #15
    def send_request(self, urls):

        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue
            date, content = self.parser_data(url)
            if content in ('error', 'timeout'):
                continue
            self.update_news(url, content, date)
Example #16
    def send_request(self, urls):

        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue
            content = self.get_content(url)
            if content == 'error' or content == 'timeout':
                continue
            self.update_content(url, content)
Example #17
    def run(self):
        log_line('CbrcSpider started!!!')

        urls = self.get_html(self.start_url)
        self.send_request(urls)

        for news in self.newslist:
            find_one = self.mgr.find_one('url', news.url)
            if find_one is not None:
                log_line('URL already exists, skipping insert')
                log(news.url)
                continue
            self.mgr.insert(news)

        self.__class__().re_send()
Example #18
    def run(self):
        log_line('CsSpider started!!!')

        start_urls = [
            'http://www.cs.com.cn/',
        ]

        for url in start_urls:
            urls = self.get_html(url)
            news_list = self.send_request(urls)

            for news in news_list:
                self.mgr.insert(news)

        self.__class__().re_send()
Example #19
    def send_request(self, urls):
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue

            content = self.get_content(url)
            if content in ('timeout', 'error'):
                continue

            for news in self.newslist:
                if news.url == url:
                    news.content = content
Example #20
    def run(self):

        log_line('CctvSpider started!!!')

        urls = []
        url = 'http://jingji.cctv.com/'
        urls_1 = self.get_html(url)
        urls_2 = self.get_jsondata()
        urls.extend(urls_1)
        urls.extend(urls_2)
        urls = set(urls)

        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #21
    def run(self):
        log_line('XinHuaSpider started!!!')

        news_list = []
        # Crawl the first two pages of the finance section
        news_list_1 = self.get_itemlist(page='1')
        news_list_2 = self.get_itemlist(page='2')

        news_list_3 = self.get_lunbo()
        news_list_4 = self.get_money()
        news_list.extend(news_list_1)
        news_list.extend(news_list_2)
        news_list.extend(news_list_3)
        news_list.extend(news_list_4)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #22
    def run(self):
        log_line('BjjrjSpider started!!!')

        urls = []
        url = 'http://www.bjjrj.gov.cn/zcfg/c19-list-1.html'
        urls_1 = self.get_html(url)
        url = 'http://www.bjjrj.gov.cn/zyzc/c138-list-1.html'
        urls_2 = self.get_html(url)

        urls.extend(urls_1)
        urls.extend(urls_2)

        news_list = self.send_request(urls)

        log_line(len(news_list))

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #23
    def run(self):

        log_line('CircSpider started!!!')

        for url in self.start_urls:
            self.get_html(url)
            self.send_request(self.get_newsUrls())

            # for news in self.newslist:
            #     log(news.url, news.content)
            #
            for news in self.newslist:
                find_one = self.mgr.find_one('url', news.url)
                if find_one is not None:
                    log_line('URL already exists, skipping insert')
                    log(news.url)
                    continue
                self.mgr.insert(news)

        self.__class__().re_send()
Example #24
    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue

            news = self.get_newsinfo(url)

            if news == 'error':
                log('News page not found, moving on to the next URL')
                continue
            if news == 'timeout':
                log('News request timed out, skipping for now')
                continue

            news_list.append(news)
        return news_list
Example #25
    def get_money(self):
        '''
        Money section
        :return:
        '''
        url = 'http://www.xinhuanet.com/money/index.htm'
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)

        urls_all = []

        urls_1 = html.xpath('//li[@class="clearfix"]/h3/a/@href')

        # Only process the news list
        urls_2 = html.xpath('//li[@class="imp"]/a/@href')
        urls_3 = html.xpath('//div[@class="swiper-slide"]/a/@href')

        urls_all.extend(urls_1)
        urls_all.extend(urls_2)
        urls_all.extend(urls_3)

        # log(len(urls_all), urls_all)

        news_list = []

        for url in urls_all:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue

            news = self.get_iteminfo(url)
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list
Example #26
    def run(self):
        log_line('PbcSpider started!!!')

        # Announcements
        dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
        self.send(dest_url, self.parser_gonggao_list, self.parse_gonggao_item)

        # Laws and regulations
        dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
        self.send(dest_url, self.parser_falvfagui, self.parser_common_item)


        # Monetary policy - not finished yet
        # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
        # self.send(dest_url, self.parser_xindai, self.parser_common_item)


        # Credit policy
        dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'
        self.send(dest_url, self.parser_xindai, self.parser_common_item)


        self.__class__().re_send()
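The send helper called above is not included in this listing; presumably it chains get_html (Example #29) with the given list-parser callback and then send_request (Example #27). A rough sketch under that assumption:

    # Hypothetical sketch of PbcSpider.send(); the real method is not shown in these examples
    def send(self, dest_url, parser_list_fuc, parser_item_fuc):
        response = self.get_html(dest_url)
        if response == 'timeout':
            return
        urls = parser_list_fuc(response)                       # extract detail-page links
        news_list = self.send_request(urls, parser_item_fuc)   # see Example #27
        for news in news_list:
            self.mgr.insert(news)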
Example #27
    def send_request(self, urls, parser_item_fuc):
        '''
        Request each concrete news link
        :param urls: the specific news URLs
        :param parser_item_fuc: function used to parse each news detail page
        :return: list of parsed News objects
        '''
        news_list = []
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue

            news = self.get_newsinfo(url, parser_item_fuc)

            if news == 'error' or news == 'timeout':
                continue

            news_list.append(news)
        return news_list
Example #28
    def run(self):

        log_line('FangChanSpider started!!!')

        start_urls = [
            'http://www.fangchan.com/policy/28/',
            'http://www.fangchan.com/plus/nlist.php?tid=2&tags=%E5%8E%9F%E5%88%9B',
            'http://www.fangchan.com/plus/nlist.php?tid=2&column=%E5%AE%8F%E8%A7%82',
            'http://www.fangchan.com/news/6/',
            'http://www.fangchan.com/news/1/',
            'http://www.fangchan.com/news/9/',
            'http://www.fangchan.com/news/5/',
            'http://www.fangchan.com/news/7/',
            'http://www.fangchan.com/news/4/',
        ]

        for url in start_urls:
            urls = self.get_html(url)
            news_list = self.send_request(urls)

            for news in news_list:
                self.mgr.insert(news)

        self.__class__().re_send()
Example #29
    def get_html(self, dest_url):
        '''
        Decode the JavaScript challenge on PBC pages and request again to get the original HTML
        :param dest_url: the PBC link to visit
        :return: HTML source (a requests Response object)
        '''

        r = requests.session()

        # dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
        # dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
        # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
        # dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'

        # Keep cookies in a session: the first request sets cookies like
        # {'wzwsconfirm': 'ab3039756ba3ee041f7e68f634d28882', 'wzwsvtime': '1488938461'};
        # only combined with the cookie computed from the JS does the check pass
        content = r.get(dest_url).content
        # Extract the page's inline script
        re_script = re.search(r'<script type="text/javascript">(?P<script>.*)</script>', content.decode('utf-8'),
                              flags=re.DOTALL)
        # '.' matches any character (with re.DOTALL); (?P<name>...) captures a named group: https://docs.python.org/3/howto/regex.html#regex-howto
        # cheatsheet: https://github.com/tartley/python-regex-cheatsheet/blob/master/cheatsheet.rst
        script = re_script.group('script')
        script = script.replace('\r\n', '')
        # Stripping \r\n and similar characters before beautifying gives a better result
        res = jsbeautifier.beautify(script)
        # Beautify (and partially parse) the JS code: https://github.com/beautify-web/js-beautify
        with open('x.js', 'w') as f:
            f.write(res)
        # Written to a file for inspection and analysis

        jscode_list = res.split('function')
        var_ = jscode_list[0]
        var_list = var_.split('\n')
        template_js = var_list[3]  # taken by position; a regex would also work
        template_py = js2py.eval_js(template_js)
        # Inline all the global variables into the first function as locals and evaluate it
        function1_js = 'function' + jscode_list[1]
        position = function1_js.index('{') + 1
        function1_js = function1_js[:position] + var_ + function1_js[position:]
        function1_py = js2py.eval_js(function1_js)
        cookie1 = function1_py(str(template_py))  # result looks like 'NA=='
        # Save the first cookie
        cookies = {}
        cookies['wzwstemplate'] = cookie1
        # Do the same with the third function
        function3_js = 'function' + jscode_list[3]
        position = function3_js.index('{') + 1
        function3_js = function3_js[:position] + var_ + function3_js[position:]
        function3_py = js2py.eval_js(function3_js)
        middle_var = function3_py()  # a str, something like 'WZWS_CONFIRM_PREFIX_LABEL4132209'
        cookie2 = function1_py(middle_var)
        cookies['wzwschallenge'] = cookie2
        # For document.cookie in the JS code, see https://developer.mozilla.org/zh-CN/docs/Web/API/Document/cookie
        dynamicurl = js2py.eval_js(var_list[0])

        # Requesting the provided dynamic URL with the new cookies yields the content page we want
        r.cookies.update(cookies)
        # content = r.get(self.host_url + dynamicurl).content.decode('utf-8')


        try:
            content = r.get(self.host_url + dynamicurl, timeout=3)
            content.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        return content
Example #30
    def run(self):
        log_line('JingJiSpider started!!!')

        news_list = self.get_newslist()
        self.get_newsinfo(news_list)