Example #1
class PbcSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.host_url = 'http://www.pbc.gov.cn'


    def get_news_header(self):
        return {
            # 'Host': '',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.cnstock.com/',
        }



    def get_html(self, dest_url):
        '''
        Decode the PBC JavaScript challenge script, then request the page again to get the real HTML.
        :param dest_url: the PBC URL to visit
        :return: the HTML source as a requests response, or 'timeout' on failure
        '''



        r = requests.session()



        # dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
        # dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
        # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
        # dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'


        # Use a session to keep cookies: the first request sets something like
        # {'wzwsconfirm': 'ab3039756ba3ee041f7e68f634d28882', 'wzwsvtime': '1488938461'},
        # which must be combined with the cookies computed from the JS to pass the check
        content = r.get(dest_url).content
        # Extract the inline <script> content from the page
        re_script = re.search(r'<script type="text/javascript">(?P<script>.*)</script>', content.decode('utf-8'),
                              flags=re.DOTALL)
        # '.' with re.DOTALL matches any character; (?P<name>...) captures a named group: https://docs.python.org/3/howto/regex.html#regex-howto
        # cheatsheet: https://github.com/tartley/python-regex-cheatsheet/blob/master/cheatsheet.rst
        script = re_script.group('script')
        script = script.replace('\r\n', '')
        # Stripping characters like \r\n before beautifying gives a better result
        res = jsbeautifier.beautify(script)
        # Beautify (and partially normalize) the JS code: https://github.com/beautify-web/js-beautify
        with open('x.js', 'w') as f:
            f.write(res)
        # Dump to a file for inspection

        jscode_list = res.split('function')
        var_ = jscode_list[0]
        var_list = var_.split('\n')
        template_js = var_list[3]  # picked by position; a regex would work too
        template_py = js2py.eval_js(template_js)
        # Inline all global variables into the first function as locals, then evaluate it
        function1_js = 'function' + jscode_list[1]
        position = function1_js.index('{') + 1
        function1_js = function1_js[:position] + var_ + function1_js[position:]
        function1_py = js2py.eval_js(function1_js)
        cookie1 = function1_py(str(template_py))  # result looks like 'NA=='
        # Save the first cookie
        cookies = {}
        cookies['wzwstemplate'] = cookie1
        # Do the same with the third function
        function3_js = 'function' + jscode_list[3]
        position = function3_js.index('{') + 1
        function3_js = function3_js[:position] + var_ + function3_js[position:]
        function3_py = js2py.eval_js(function3_js)
        middle_var = function3_py()  # a str, e.g. 'WZWS_CONFIRM_PREFIX_LABEL4132209'
        cookie2 = function1_py(middle_var)
        cookies['wzwschallenge'] = cookie2
        # For document.cookie in the JS code see https://developer.mozilla.org/zh-CN/docs/Web/API/Document/cookie
        dynamicurl = js2py.eval_js(var_list[0])

        # Visiting the dynamic URL with the new cookies yields the real content page
        r.cookies.update(cookies)
        # content = r.get(self.host_url + dynamicurl).content.decode('utf-8')


        try:
            content = r.get(self.host_url + dynamicurl, timeout=3)
            content.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        return content

    def send_request(self, urls, parser_item_fuc):
        '''
        Request each individual news URL.
        :param urls: list of news detail URLs
        :param parser_item_fuc: function used to parse each news detail page
        :return: list of parsed News objects
        '''
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            news = self.get_newsinfo(url, parser_item_fuc)

            if news == 'error' or news == 'timeout':
                continue

            news_list.append(news)
        return news_list


    def get_newsinfo(self, url, parser_item_fuc):
        '''
        Request a single news detail page.
        '''
        t_sleep()

        log('当前访问的URL', url)


        html = self.get_html(url)
        if html == 'timeout':
            return 'error'

        response = etree.HTML(html.text)
        log('当前访问的URL', url, html.status_code)

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        # parser_item_fuc(response)

        title, date, content = parser_item_fuc(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'pbc'
        return news



    def parser_gonggao_list(self, content):
        '''
        Parse the announcements list page.
        :param content: response of the announcements page, used to extract the announcement links
        :return: list of announcement detail URLs
        '''
        html = etree.HTML(content.text)
        doms = html.xpath('//font[@class="newslist_style"]')

        urls = []

        for e in doms:
            # log('标题', e.xpath('./a/text()')[0].strip())
            # log('url', e.xpath('./a/@href')[0].strip())
            # log('日期', e.getnext().xpath('./text()')[0].strip())
            url = self.host_url + e.xpath('./a/@href')[0].strip()

            urls.append(url)
        return urls


    def parser_falvfagui(self, content):
        html = etree.HTML(content.text)
        doms = html.xpath('//td[@class="font14 bgdian"]')

        urls = []

        for e in doms:

            url = self.host_url + e.xpath('./a/@href')[0].strip()
            # log('标题', e.xpath('./a/text()')[0].strip())

            # log('解析的新闻URL', url)

            urls.append(url)
        return urls


    def parser_huobi(self, content):
        html = etree.HTML(content.text)
        doms = html.xpath('//a[@class="hei12jj"]')

        urls = []

        for e in doms:

            url = self.host_url + e.xpath('./@href')[0].strip()
            # log('标题', e.xpath('./a/text()')[0].strip())

            # log('解析的新闻URL', url)

            urls.append(url)
        return urls


    def parser_xindai(self, content):
        html = etree.HTML(content.text)
        doms = html.xpath('//a[@class="hei12jj"]')

        urls = []

        for e in doms:

            url = self.host_url + e.xpath('./@href')[0].strip()
            # log('标题', e.xpath('./text()')[0].strip())

            # log('解析的新闻URL', url)

            urls.append(url)
        return urls


    def parse_gonggao_item(self, response):
        '''
        Parse an announcement detail page.
        :param response:
        :return:
        '''
        try:
            title = response.xpath('//h2[@style="FONT-SIZE: 16px"]/text()')
            title = ''.join(title).strip()
        except Exception as e:
            title = '未知'

        try:
            date = response.xpath('//td[@class="hui12"][@align="right"]/text()')[0]
        except Exception as e:
            date = '未知'
        try:
            con_list = response.xpath('//font[@id="zoom"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = ['未知']
        content = ''.join(con_list).strip()
        # log('content', title, date)
        return title, date, content


    def parser_common_item(self, response):
        '''
        Parse laws & regulations and credit policy detail pages.
        '''
        try:
            title = response.xpath('//h2[@style="font-size: 16px;color: #333;"]/text()')
            title = ''.join(title).strip()
        except Exception as e:
            title = '未知'

        try:
            date = response.xpath('//span[@id="shijian"]/text()')[0].split()[0]
        except Exception as e:
            date = '未知'
        try:
            con_list = response.xpath('//div[@id="zoom"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = ['未知']
        content = ''.join(con_list).strip()
        # log('法律法规', content)
        return title, date, content

    def send(self, dest_url, get_news_list, parser_news):
        '''
        Fetch a PBC section page, extract its news URLs, then request and parse each news detail.
        :param dest_url: target section URL
        :param get_news_list: function that extracts the news URLs from the section page
        :param parser_news: function that parses title, date, etc. from a news page
        :return: None
        '''
        content = self.get_html(dest_url)
        urls = get_news_list(content)
        news_list = self.send_request(urls, parser_news)
        for news in news_list:
            self.mgr.insert(news)


    def run(self):
        log_line('PbcSpider 启动!!!')

        # Announcements
        dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
        self.send(dest_url, self.parser_gonggao_list, self.parse_gonggao_item)

        # Laws and regulations
        dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
        self.send(dest_url, self.parser_falvfagui, self.parser_common_item)


        # Monetary policy - not implemented yet
        # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
        # self.send(dest_url, self.parser_xindai, self.parser_common_item)


        # Credit policy
        dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'
        self.send(dest_url, self.parser_xindai, self.parser_common_item)


        self.__class__().re_send()
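
A minimal sketch of the WZWS challenge flow that get_html above implements, reduced to its three steps: fetch the challenge script, evaluate it with js2py, and revisit the returned dynamic URL with the computed cookies. The helper name solve_wzws_challenge and the URL in the usage comment are illustrative, not part of the spider.

import re
import requests
import js2py
import jsbeautifier

def solve_wzws_challenge(session, url, host_url='http://www.pbc.gov.cn'):
    # The first request only returns the anti-bot JavaScript plus wzwsconfirm/wzwsvtime cookies.
    raw = session.get(url).content.decode('utf-8')
    script = re.search(r'<script type="text/javascript">(?P<script>.*)</script>',
                       raw, flags=re.DOTALL).group('script')
    script = jsbeautifier.beautify(script.replace('\r\n', ''))

    # Same slicing as the spider: global variables first, then the challenge functions.
    parts = script.split('function')
    global_vars = parts[0]
    template = js2py.eval_js(global_vars.split('\n')[3])

    def eval_with_globals(fn_src):
        # Inline the globals right after the opening brace so the function can be evaluated on its own.
        pos = fn_src.index('{') + 1
        return js2py.eval_js(fn_src[:pos] + global_vars + fn_src[pos:])

    encode = eval_with_globals('function' + parts[1])
    confirm = eval_with_globals('function' + parts[3])

    session.cookies.update({
        'wzwstemplate': encode(str(template)),
        'wzwschallenge': encode(confirm()),
    })
    dynamic_url = js2py.eval_js(global_vars.split('\n')[0])
    return session.get(host_url + dynamic_url)

# Usage sketch: solve_wzws_challenge(requests.session(), 'http://www.pbc.gov.cn/tiaofasi/144941/index.html')
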
Example #2
class MoHurdSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.start_urls = [
            'http://www.mohurd.gov.cn/zcjd/index.html',
            'http://www.mohurd.gov.cn/fdcy/fdcyzcfb/index.html',
            'http://www.mohurd.gov.cn/fdcy/fdcyxydt/index.html',
            'http://www.mohurd.gov.cn/fdcy/fdcydfxx/index.html',
        ]

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.mohurd.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''

        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//a[@style="color:#000;;font-size:12px;"]')

        # log_line(len(items))

        for item in items:
            self.parser_item(item)

    def parser_item(self, item):
        news = News()
        news.spider_name = 'mohurd'
        news.url = item.xpath('./@href')[0]
        news.title = item.xpath('./text()')[0]
        news.date = item.getparent().getnext().xpath(
            './text()')[0][1:-1].replace('.', '-').strip()
        self.newslist.append(news)

    def get_newsUrls(self):
        return [news.url for news in self.newslist]

    def send_request(self, urls):

        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            content = self.get_content(url)
            if content == 'error' or content == 'timeout':
                continue
            self.update_content(url, content)

    def get_content(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//div[@class="union"]/descendant-or-self::*/text()')
        return ''.join(con_list).strip()

    def update_content(self, url, content):
        for news in self.newslist:
            if news.url == url:
                news.content = content

    def run(self):
        log_line('MoHurdSpider 启动!!!')

        for url in self.start_urls:
            self.get_html(url)
            self.send_request(self.get_newsUrls())

            for news in self.newslist:
                find_one = self.mgr.find_one('url', news.url)
                if find_one is not None:
                    log_line('该URL已经存在 无需写入')
                    log(news.url)
                    continue
                self.mgr.insert(news)

        self.__class__().re_send()
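
Every spider here deduplicates and persists through MogoMgr.find_one('url', ...) and MogoMgr.insert(news). The class itself is not shown in these examples; a plausible pymongo-based sketch (the database/collection names and the News fields are assumptions) could look like this:

import pymongo

class MogoMgr(object):
    '''Hypothetical sketch of the storage helper the spiders rely on.'''

    def __init__(self, host='localhost', port=27017):
        self.client = pymongo.MongoClient(host, port)
        self.coll = self.client['news']['news']  # assumed db/collection names

    def find_one(self, key, value):
        # Returns the stored document, or None if this URL was never crawled.
        return self.coll.find_one({key: value})

    def insert(self, news):
        # Assumes News exposes its fields as plain attributes.
        self.coll.insert_one({
            'title': news.title,
            'date': news.date,
            'content': news.content,
            'url': news.url,
            'spider_name': news.spider_name,
        })
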
Example #3
class XinHuaSpider(BaseSpider):


    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_newlist_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'qc.wa.news.cn',
            'Referer': 'http://www.news.cn/fortune/',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',

        }


    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.xinhuanet.com',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',

        }


    def get_caijing_header(self):
        return {
            'Host': 'www.news.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',

        }


    def get_money(self):
        '''
        Money (finance) section.
        :return:
        '''
        url = 'http://www.xinhuanet.com/money/index.htm'
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)

        urls_all = []

        urls_1 = html.xpath('//li[@class="clearfix"]/h3/a/@href')

        # Only the news-list links are processed
        urls_2 = html.xpath('//li[@class="imp"]/a/@href')
        urls_3 = html.xpath('//div[@class="swiper-slide"]/a/@href')

        urls_all.extend(urls_1)
        urls_all.extend(urls_2)
        urls_all.extend(urls_3)

        # log(len(urls_all), urls_all)

        news_list = []

        for url in urls_all:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            news = self.get_iteminfo(url)
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list



    def get_lunbo(self):
        '''
        Fortune (finance) section carousel.
        :return:
        '''
        url = 'http://www.news.cn/fortune/'
        html = requests.get(url, headers=self.get_caijing_header())
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="swiper-slide"]/a/@href')

        year = arrow.now().date().year

        news_list = []

        for url in urls:
            if str(year) in url:
                log('需要访问的URL 轮播图', url)
                find_one = self.mgr.find_one('url', url)
                if find_one is not None:
                    log_line('该URL已经存在 无需请求')
                    log(url)
                    continue
                news = self.get_iteminfo(url)
                if news == 'timeout' or news == 'error':
                    continue
                news_list.append(news)
        return news_list



    def get_itemlist(self, page='1'):
        '''
        Fetch the Xinhua finance news list and every news detail on it.
        :return: list of News models
        '''

        # Xinhua finance - news list endpoint
        url = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={0}&cnt=16&tp=1&orderby=1'.format(page)

        html = requests.get(url, headers=self.get_newlist_header())
        items = json.loads(html.text[1:-1])
        items = items['data']['list']

        news_list = []

        for item in items:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', item['LinkUrl'])
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(item['LinkUrl'])
                continue

            news = self.get_iteminfo(item['LinkUrl'])
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list

    def get_iteminfo(self, url):
        '''
        Visit a single news detail page.
        :param url: news detail URL
        :return: News model, or 'timeout'/'error' on failure
        '''
        t_sleep()

        log('当前访问的URL', url)


        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            return 'timeout'

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)
        title, date, content = self.parse_item(response)

        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'xinhua'
        return news


    def parse_item(self, response):
        try:
            con_list = response.xpath('//div[@id="p-detail"]/p')
            content = self.parse_content(con_list)
            title = response.xpath('//div[@class="h-title"]/text()')[0].strip()
            date = response.xpath('//span[@class="h-time"]/text()')[0].split()[0]
        except Exception as e:

            title = '页面不存在'
            date = '页面不存在'
            content = '页面不存在'


        return title, date, content

    def parse_content(self, con_list):
        '''
        Parse the article body paragraphs.
        :param con_list: list of <p> elements
        :return: concatenated text content
        '''

        content = ''

        for con in con_list:
            c = con.xpath('./text()')
            if len(c) != 0:
                content = content + c[0].replace(' ', '')

        return content

    def run(self):
        log_line('XinHuaSpider 启动!!!')

        news_list = []
        # Crawl the first two pages of the finance section
        news_list_1 = self.get_itemlist(page='1')
        news_list_2 = self.get_itemlist(page='2')

        news_list_3 = self.get_lunbo()
        news_list_4 = self.get_money()
        news_list.extend(news_list_1)
        news_list.extend(news_list_2)
        news_list.extend(news_list_3)
        news_list.extend(news_list_4)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
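
get_itemlist above strips the first and last character before json.loads because the qc.wa.news.cn list endpoint wraps its JSON payload in parentheses (a JSONP-style response). A standalone sketch of that parsing step with a hard-coded sample body (not a real API reply):

import json

raw = '({"data": {"list": [{"LinkUrl": "http://www.xinhuanet.com/money/example.htm"}]}})'

items = json.loads(raw[1:-1])['data']['list']  # drop the surrounding parentheses first
urls = [item['LinkUrl'] for item in items]
print(urls)
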
Example #4
class CircSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.start_urls = [
            'http://www.circ.gov.cn/web/site0/tab5176/',
            'http://www.circ.gov.cn/web/site0/tab7924/',
            'http://www.circ.gov.cn/web/site0/tab5207/',
        ]

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.circ.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''

        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'

        # log(html.text)

        html = etree.HTML(html.text)
        items = html.xpath('//td[@class="hui14"]')

        # log(len(items))

        for item in items:
            self.parser_item(item)

    def parser_item(self, item):
        url = item.xpath('./a/@href')[0]
        if 'search' in url:
            return

        date = item.getnext().xpath('./text()')[0][1:-1]

        news = News()
        news.spider_name = 'circ'
        news.url = self.parser_url(url, 'http://www.circ.gov.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = date

        # log(news.url, news.title, news.date)

        self.newslist.append(news)

    def parser_url(self, url, base_url):
        return base_url + url

    def get_newsUrls(self):
        return [news.url for news in self.newslist]

    def send_request(self, urls):

        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            content = self.get_content(url)
            if content == 'timeout' or content == 'error':
                continue
            self.update_content(url, content)

    def get_content(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            return 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//span[@id="zoom"]/descendant-or-self::*/text()')
        return ''.join(con_list).strip().replace('\r\n', '')

    def update_content(self, url, content):
        for news in self.newslist:
            if news.url == url:
                news.content = content

    def run(self):

        log_line('CircSpider 启动!!!')

        for url in self.start_urls:
            self.get_html(url)
            self.send_request(self.get_newsUrls())

            # for news in self.newslist:
            #     log(news.url, news.content)
            #
            for news in self.newslist:
                find_one = self.mgr.find_one('url', news.url)
                if find_one is not None:
                    log_line('该URL已经存在 无需写入')
                    log(news.url)
                    continue
                self.mgr.insert(news)

        self.__class__().re_send()
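
parser_item above reads the publication date from the table cell that follows the title cell via lxml's getnext(), then strips the surrounding parentheses with [1:-1]. A self-contained illustration of that sibling lookup on a made-up row:

from lxml import etree

row = etree.HTML('''
<table><tr>
  <td class="hui14"><a href="/web/site0/tab5176/info1.htm">Some notice</a></td>
  <td>(2019-01-02)</td>
</tr></table>
''')

cell = row.xpath('//td[@class="hui14"]')[0]
date = cell.getnext().xpath('./text()')[0][1:-1]  # following <td>, parentheses stripped
print(date)  # 2019-01-02
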
Example #5
class StcnSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    # def get_date(self):
    #     year, month, day = get_today()
    #     date = str(year) + '-' + str(month) + '-' + str(day)
    #     return date

    def get_host(self, url):
        host = url.split('/')[2]
        return host

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            # 'Host': 'epaper.zqrb.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.stcn.com/',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'gbk'

        # log(html.text)

        pattern = r"http://[a-z]+\.stcn.com/\d+/\d+/\d+.shtml"

        urls = re.findall(pattern, html.text)

        # new_urls = []
        # for ur in urls:
        #     log(ur)
        # new_urls.append(self.parser_url(ur))

        # log('数量', len(urls))
        return urls

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            news = self.get_newsinfo(url)

            if news == 'error' or news == 'timeout':
                continue

            news_list.append(news)
        return news_list

    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()

        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        response = etree.HTML(html.text)

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'stcn'
        return news

    def parse_item(self, response):

        try:
            title = response.xpath('//div[@class="intal_tit"]/h2/text()')
            title = ''.join(title).strip()
        except Exception as e:
            title = '未知'

        try:
            date = response.xpath('//div[@class="info"]/text()')[0].split()[0]
        except Exception as e:
            date = '未知'
        try:
            con_list = response.xpath(
                '//div[@id="ctrlfscont"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = ['未知']
        content = ''.join(con_list).strip()
        # log('content', content)
        return title, date, content

    def run(self):
        log_line('StcnSpider 启动!!!')

        url = 'http://www.stcn.com/'

        urls = self.get_html(url)
        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #6
class ShangHaiSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.shanghai.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        urls = html.xpath('//ul[@class="uli14 pageList"]/li/a/@href')

        return self.parser_url(urls)

    def parser_url(self, urls):
        base_url = 'http://www.shanghai.gov.cn'
        new_urls = []
        for url in urls:
            if str(url).endswith('.pdf'):
                continue

            url = base_url + url
            new_urls.append(url)
        return new_urls

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_newsinfo(url)
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list

    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'shanghai'
        return news

    def parse_item(self, response):

        title = response.xpath('//div[@id="ivs_title"]/text()')[0].strip()
        date = response.xpath('//div[@id="ivs_date"]/text()')[0][1:-1].strip()
        date = arrow.get(date).format('YYYY-MM-DD')

        con_list = response.xpath(
            '//div[@id="ivs_content"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()

        return title, date, content

    def run(self):
        log_line('ShangHaiSpider 启动!!!')

        url = 'http://www.shanghai.gov.cn/nw2/nw2314/nw2319/nw41893/index.html'
        urls = self.get_html(url)

        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
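
parse_item normalizes the extracted date with arrow before it is stored. A quick illustration of that call; the input string is an assumed example of what ivs_date might hold after the [1:-1] slice:

import arrow

raw_date = '2019-03-08'
print(arrow.get(raw_date).format('YYYY-MM-DD'))  # -> 2019-03-08
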
Example #7
class CbrcSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.start_url = 'http://www.cbrc.gov.cn/chinese/zhengcefg.html'

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.cbrc.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//a[@class="STYLE8"]')

        for item in items:
            news = News()
            news.spider_name = 'cbrc'
            news.url = item.xpath('./@href')[0]
            news.title = item.xpath('./@title')[0]
            news.date = item.getparent().getnext().xpath('./text()')[0].strip()

            self.newslist.append(news)

        return self.parser_url(self.newslist)

    def parser_url(self, newslist):
        base_url = 'http://www.cbrc.gov.cn'
        new_urls = []
        for news in newslist:
            url = base_url + news.url
            news.url = url
            # log('拼接后的URL', url)
            new_urls.append(url)
        return new_urls

    def send_request(self, urls):
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            content = self.get_content(url)
            if content == 'timeout' or content == 'error':
                continue

            for news in self.newslist:
                if news.url == url:
                    news.content = content

    def get_content(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            return 'error'

        response = etree.HTML(html.text)

        return self.parse_item(response)

    def parse_item(self, response):

        try:
            con_list = response.xpath(
                '//div[@class="notice_t"]/descendant-or-self::*/text()')
            content = ''.join(con_list).strip().replace('\r\n', '')
        except Exception as e:
            content = '页面不存在'
        return content

    def run(self):
        log_line('CbrcSpider 启动!!!')

        urls = self.get_html(self.start_url)
        self.send_request(urls)

        for news in self.newslist:
            find_one = self.mgr.find_one('url', news.url)
            if find_one is not None:
                log_line('该URL已经存在 无需写入')
                log(news.url)
                continue
            self.mgr.insert(news)

        self.__class__().re_send()
Example #8
class FangChanSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.fangchan.com',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.fangchan.com/policy/28/',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        urls = html.xpath('//ul[@class="related-news-list"]/li/a/@href')

        return urls

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_newsinfo(url)

            if news == 'error' or news == 'timeout':
                continue
            news_list.append(news)
        return news_list

    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'fangchan'
        return news

    def parse_item(self, response):

        try:
            title = response.xpath(
                '//div[@class="section top"]/h1/text()')[0].strip()
        except Exception as e:
            title = response.xpath('//h1[@class="clearfix"]/text()')[0].strip()

        try:
            date = response.xpath(
                '/html/body/div[1]/div[2]/div[1]/p/span[2]/text()')[0].split(
                )[0]
        except Exception as e:
            try:
                date = response.xpath(
                    '/html/body/div/div[2]/div/div[2]/ul/li[2]/span/text()'
                )[0].split()[0]
            except Exception as e:
                date = '未知'

        con_list = response.xpath(
            '//div[@class="summary-text"]/descendant-or-self::*/text()')
        if len(con_list) == 0:
            con_list = response.xpath(
                '//div[@class="summary_text"]/descendant-or-self::*/text()')

        content = ''.join(con_list).strip()

        return title, date, content

    def run(self):

        log_line('FangChanSpider 启动!!!')

        start_urls = [
            'http://www.fangchan.com/policy/28/',
            'http://www.fangchan.com/plus/nlist.php?tid=2&tags=%E5%8E%9F%E5%88%9B',
            'http://www.fangchan.com/plus/nlist.php?tid=2&column=%E5%AE%8F%E8%A7%82',
            'http://www.fangchan.com/news/6/',
            'http://www.fangchan.com/news/1/',
            'http://www.fangchan.com/news/9/',
            'http://www.fangchan.com/news/5/',
            'http://www.fangchan.com/news/7/',
            'http://www.fangchan.com/news/4/',
        ]

        for url in start_urls:
            urls = self.get_html(url)
            news_list = self.send_request(urls)

            for news in news_list:
                self.mgr.insert(news)

        self.__class__().re_send()
Example #9
class CctvSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'jingji.cctv.com',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        Fetch the links in the top carousel.
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'
        # log(html.text)

        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="shadow"]/ul/li/p/a/@href')
        # log(len(urls), urls)
        return urls

    def get_jsondata(self):
        '''
        Hit the JSON API directly.
        :return:
        '''

        url = 'http://jingji.cctv.com/data/index.json'
        html = requests.get(url)
        html.encoding = 'gbk'
        news_list = json.loads(html.text)['rollData']

        urls = []
        for news in news_list:
            urls.append(news['url'])

        return urls

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_newsinfo(url)
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list

    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'cctv'
        return news

    def parse_item(self, response):
        try:
            title = response.xpath(
                '//div[@class="cnt_bd"]/h1/text()')[0].strip()
        except Exception as e:

            title = '页面不存在'

        try:
            date = response.xpath(
                '//span[@class="info"]/i/text()')[1].split()[0]
            date = arrow.get(date).format('YYYY-MM-DD')

        except Exception as e:
            try:
                date = response.xpath(
                    '//span[@class="info"]/i/text()')[0].split()[1]
                date = arrow.get(date).format('YYYY-MM-DD')
            except Exception as e:
                date = '未知'

        try:
            con_list = response.xpath('//div[@class="cnt_bd"]/p')
            content = self.parse_content(con_list)
        except Exception as e:
            content = '页面不存在'

        # log(content)
        # log(title, date)

        return title, date, content

    def parse_content(self, con_list):
        '''
        Parse the article body paragraphs.
        :param con_list: list of <p> elements
        :return: concatenated text content
        '''

        content = ''

        for con in con_list:
            c = con.xpath('./text()')
            if len(c) != 0:
                content = content + c[0].replace(' ', '')

        return content

    def run(self):

        log_line('CctvSpider 启动!!!')

        urls = []
        url = 'http://jingji.cctv.com/'
        urls_1 = self.get_html(url)
        urls_2 = self.get_jsondata()
        urls.extend(urls_1)
        urls.extend(urls_2)
        urls = set(urls)

        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
Example #10
class CsrcSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.start_urls = [
            'http://www.csrc.gov.cn/pub/zjhpublic/3300/3302/index_7401.htm',
            'http://www.csrc.gov.cn/pub/zjhpublic/3300/3311/index_7401.htm',
        ]

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.csrc.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''

        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//div[@class="row"]')

        # log_line(len(items))

        for item in items:
            self.parser_item(item)

    def parser_item(self, item):
        url = item.xpath('./li[@class="mc"]/div/a/@href')[0]
        date = item.xpath('./li[@class="fbrq"]/text()')[0]

        news = News()
        news.spider_name = 'csrc'
        news.url = self.parser_url(url, 'http://www.csrc.gov.cn/pub/zjhpublic')
        news.title = item.xpath('./li[@class="mc"]/div/a/text()')[0]
        news.date = arrow.get(date).format('YYYY-MM-DD')

        # log(news.url, news.title, news.date)
        self.newslist.append(news)

    def parser_url(self, url, base_url):
        return base_url + url.split('../..')[1]

    def get_newsUrls(self):
        return [news.url for news in self.newslist]

    def send_request(self, urls):

        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            content = self.get_content(url)
            if content == 'error' or content == 'timeout':
                continue
            self.update_content(url, content)

    def get_content(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//div[@id="ContentRegion"]/descendant-or-self::*/text()')
        return ''.join(con_list).strip().replace('\r\n', '')

    def update_content(self, url, content):
        for news in self.newslist:
            if news.url == url:
                news.content = content

    def run(self):
        log_line('CsrcSpider 启动!!!')

        for url in self.start_urls:
            self.get_html(url)
            self.send_request(self.get_newsUrls())

            for news in self.newslist:
                find_one = self.mgr.find_one('url', news.url)
                if find_one is not None:
                    log_line('该URL已经存在 无需写入')
                    log(news.url)
                    continue
                self.mgr.insert(news)

        self.__class__().re_send()
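
parser_url in CsrcSpider turns the '../..'-style hrefs from the list page into absolute URLs by splitting on '../..' and re-anchoring them at the zjhpublic base. A standalone example with a made-up href:

base_url = 'http://www.csrc.gov.cn/pub/zjhpublic'
href = '../../G00306201/201901/t20190101_000000.htm'  # illustrative relative link

absolute = base_url + href.split('../..')[1]
print(absolute)
# http://www.csrc.gov.cn/pub/zjhpublic/G00306201/201901/t20190101_000000.htm
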
Example #11
class HeXunSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_host(self, url):
        host = url.split('/')[2]
        return host

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            # 'Host': '',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.hexun.com/',
        }


    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'gbk'

        pattern = r"http://[a-z]{4,10}.hexun.com/\d+-\d+-\d+/\d+.html"
        urls = re.findall(pattern, html.text)

        # log(html.text)

        # for ur in  urls:
        #     log(type(ur), ur)
        #
        #
        # log('数量', len(urls))

        return urls

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            news = self.get_newsinfo(url)

            if news == 'error' or news == 'timeout':
                continue

            news_list.append(news)
        return news_list


    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)


        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log('访问的url 状态不是200', url)
            return 'error'

        response = etree.HTML(html.text)


        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'hexun'
        return news

    def parse_item(self, response):

        try:
            title = response.xpath('//div[@class="layout mg articleName"]/h1/text()')[0].strip()
        except Exception as e:
            title = '未知'
        try:
            date = response.xpath('//span[@class="pr20"]/text()')[0].split()[0]
        except Exception as e:
            date = '未知'
        try:
            con_list = response.xpath('//div[@class="art_contextBox"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = '未知'

        content = ''.join(con_list).strip()
        # log('content', content)
        return title, date, content

    def run(self):
        log_line('HeXunSpider 启动!!!')


        start_urls = [
            'http://www.hexun.com/',
        ]

        for url in  start_urls:
            urls = self.get_html(url)
            news_list = self.send_request(urls)

            for news in news_list:
                self.mgr.insert(news)

        self.__class__().re_send()
Example #12
class GzjrjSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.gzjr.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',

        }


    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="mainContent"]/ul/li/a/@href')
        log('提取的URL', urls)

        return self.parser_url(urls)


    def parser_url(self, urls):
        base_url = 'http://www.gzjr.gov.cn/'
        new_urls = []
        for url in urls:
            if str(url).endswith('.pdf'):
                continue

            url = base_url + url.split('../../')[1]
            log('拼接后的url', url)
            new_urls.append(url)
        return new_urls


    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_newsinfo(url)
            news_list.append(news)
        return news_list


    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('访问出错')
            print(e)
            return 'timeout'




        response = etree.HTML(html.text)
        log('当前访问的URL', url)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        return news


    def parse_item(self, response):

        title = response.xpath('//div[@id="ivs_title"]/text()')[0].strip()
        date = response.xpath('//div[@id="ivs_date"]/text()')[0][1:-1].strip()
        date = arrow.get(date).format('YYYY-MM-DD')

        con_list = response.xpath('//div[@id="ivs_content"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()

        return title, date, content


    def run(self):
        log_line('GzjrjSpider 启动!!!')

        pass
Example #13
class CnstockSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            # 'Host': '',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.cnstock.com/',
        }


    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'

        # log(html.text)

        pattern_1 = r"http://[a-z]+.cnstock.com/[a-z]+,[a-z]+-\d+-\d+.htm"
        pattern_2 = r"http://[a-z]+.cnstock.com/[a-z]+/[a-z]+_[a-z]+/\d+/\d+.htm"
        pattern_3 = r"http://[a-z]+.cnstock.com/[a-z]+/[a-z]+_[a-z]+/[a-z]+_[a-z]+/\d+/\d+.htm"

        # pattern = r'http://[a-z]+.cnstock.com/.*?/\d+.htm'
        # pattern = r'<a href=".*?\d+.htm"'

        # pattern = r'"http://.*?/\d+.htm"'
        # pattern = r'"http://(\.|[a-z]|/|,|-)+\d+.htm"'
        # pattern = r'"http://(\.|[a-z]|/|,|-)*\d+.htm"'


        pattern = '|'.join([pattern_1, pattern_2, pattern_3])
        urls = re.findall(pattern, html.text)
        # for ur in urls:
        #     log(ur)


        # log('数量', len(urls))
        return set(urls)

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            news = self.get_newsinfo(url)

            if news == 'error' or news == 'timeout':
                continue

            news_list.append(news)
        return news_list


    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)


        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        # log(html.text)

        response = etree.HTML(html.text)

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        # self.parse_item(response)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'cnstock'
        return news

    def parse_item(self, response):

        try:
            title = response.xpath('//h1[@class="title"]/text()')
            title = ''.join(title).strip()
        except Exception as e:
            title = '未知'

        try:
            date = response.xpath('//span[@class="timer"]/text()')[0].split()[0]
        except Exception as e:
            date = '未知'
        try:
            con_list = response.xpath('//div[@id="qmt_content_div"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = ['未知']
        content = ''.join(con_list).strip()
        # log('content', content)
        return title, date, content


    def run(self):
        log_line('CnstockSpider 启动!!!')

        url = 'http://www.cnstock.com/'

        urls = self.get_html(url)

        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
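
CnstockSpider pulls article links straight out of the front-page HTML with three regexes joined by '|' and de-duplicates them with set(). A reduced demonstration on an inline snippet; the URLs are fabricated examples that merely match the patterns:

import re

pattern_1 = r"http://[a-z]+.cnstock.com/[a-z]+,[a-z]+-\d+-\d+.htm"
pattern_2 = r"http://[a-z]+.cnstock.com/[a-z]+/[a-z]+_[a-z]+/\d+/\d+.htm"
pattern_3 = r"http://[a-z]+.cnstock.com/[a-z]+/[a-z]+_[a-z]+/[a-z]+_[a-z]+/\d+/\d+.htm"
pattern = '|'.join([pattern_1, pattern_2, pattern_3])

html = '''
<a href="http://news.cnstock.com/news,yw-201901-1234567.htm">a</a>
<a href="http://company.cnstock.com/company/scp_gsxw/201901/1234568.htm">b</a>
<a href="http://news.cnstock.com/news,yw-201901-1234567.htm">duplicate</a>
'''

print(set(re.findall(pattern, html)))  # two unique URLs
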
Example #14
class ZqrbSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.date = self.get_date()
        self.mgr = MogoMgr()
        # self.retry = -1
        # self.retry_flag = -1
        self.failurls = []

    def get_date(self):
        year, month, day = get_today()
        date = str(year) + '-' + str(month) + '-' + str(day)
        return date

    def get_host(self, url):
        host = url.split('/')[2]
        return host

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'epaper.zqrb.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://epaper.zqrb.cn/',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)
        urls = html.xpath('//a[@class="vote_content12px"]/@href')

        new_urls = []
        for ur in urls:
            # log(self.parser_url(ur))
            new_urls.append(self.parser_url(ur))

        # log('数量', len(urls))
        return new_urls

    def parser_url(self, url):
        return self.get_base_url() + url

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            news = self.get_newsinfo(url)

            if news == 'error':
                log('访问的新闻不存在 继续访问下一个URL')
                continue
            if news == 'timeout':
                log('访问的新闻超时 暂时跳过')
                continue

            news_list.append(news)
        return news_list

    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''

        t_sleep()
        log('当前访问的URL', url)

        header = self.get_news_header()

        try:
            html = requests.get(url, headers=header, timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            self.__class__.retry = 1
            print(e)
            return 'timeout'

        response = etree.HTML(html.text)

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        # self.parse_item(response)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'zqrb'
        return news

    def parse_item(self, response):

        try:
            title = response.xpath('//td[@class="h1"]/text()')
            title = ''.join(title).strip()
        except Exception as e:
            title = '未知'

        date = self.date

        try:
            con_list = response.xpath(
                '//div[@id="ozoom"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = '未知'
        content = ''.join(con_list).strip()
        # log('content', content)
        return title, date, content

    def get_base_url(self):
        year, month, day = get_today()
        year = str(year)
        month = str(month) if month >= 10 else '0' + str(month)
        day = str(day) if day >= 10 else '0' + str(day)

        return 'http://epaper.zqrb.cn/html/{0}-{1}/{2}/'.format(
            year, month, day)

    def get_start_url(self):
        year, month, day = get_today()
        year = str(year)
        month = str(month) if month >= 10 else '0' + str(month)
        day = str(day) if day >= 10 else '0' + str(day)

        return 'http://epaper.zqrb.cn/html/{0}-{1}/{2}/node_2.htm'.format(
            year, month, day)

    def run(self):
        log_line('ZqrbSpider 启动!!!')

        url = self.get_start_url()
        urls = self.get_html(url)
        news_list = self.send_request(urls)

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
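
get_base_url and get_start_url pad the month and day by hand before formatting them into the e-paper URL. An equivalent sketch using zero-padded format specifiers, assuming get_today() returns integer (year, month, day):

def build_zqrb_urls(year, month, day):
    # {:02d} zero-pads single-digit months/days, matching the manual '0' + str(...) logic.
    base = 'http://epaper.zqrb.cn/html/{0}-{1:02d}/{2:02d}/'.format(year, month, day)
    return base, base + 'node_2.htm'

print(build_zqrb_urls(2019, 3, 5))
# ('http://epaper.zqrb.cn/html/2019-03/05/', 'http://epaper.zqrb.cn/html/2019-03/05/node_2.htm')
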
Example #15
class BjjrjSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_news_header(self):
        '''
        The request headers for the news list differ from those for the news detail pages.
        :return:
        '''
        return {
            'Host': 'www.bjjrj.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="erjiUL-word"]/a/@href')

        return self.parser_url(urls, url)

    def parser_url(self, urls, originalurl):
        base_url = originalurl.rsplit('/', 1)[0]
        new_urls = []
        for url in urls:
            url = base_url + '/' + url
            new_urls.append(url)
        return new_urls

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been stored
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_newsinfo(url)
            if news == 'timeout' or news == 'error':
                log_line('timeout error')
                continue

            news_list.append(news)
        return news_list

    def get_newsinfo(self, url):
        '''
        Fetch and parse a single news detail page.
        :param url: detail-page URL
        :return: News object, or 'error' / 'timeout' on failure
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log_line('请求状态不是200')
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'bjjrj'
        return news

    def parse_item(self, response):
        title = response.xpath('//div[@class="article"]/h1/text()')[0].strip()
        date = response.xpath(
            '//h5[@class="articleTitleSub"]/text()')[-1].split(':')[1]
        con_list = response.xpath(
            '//div[@id="zoom"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()

        return title, date, content

    def run(self):
        log_line('BjjrjSpider 启动!!!')

        urls = []
        url = 'http://www.bjjrj.gov.cn/zcfg/c19-list-1.html'
        urls_1 = self.get_html(url)
        url = 'http://www.bjjrj.gov.cn/zyzc/c138-list-1.html'
        urls_2 = self.get_html(url)

        urls.extend(urls_1)
        urls.extend(urls_2)

        news_list = self.send_request(urls)

        log_line(len(news_list))

        for news in news_list:
            self.mgr.insert(news)

        self.__class__().re_send()
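
Every run() ends with self.__class__().re_send(), and each failed request sets the class attribute retry = 1. BaseSpider itself is not shown in these examples, so the following is a rough, purely illustrative sketch of how such a retry hook could work; everything except the names retry, run and re_send is an assumption.

class BaseSpiderSketch:
    retry = -1   # flipped to 1 by any instance that hit a timeout

    def run(self):
        raise NotImplementedError

    def re_send(self):
        # Illustrative only: if the previous pass flagged a failure, reset the
        # flag and run the whole spider once more on a fresh instance.
        if self.__class__.retry == 1:
            self.__class__.retry = -1
            self.run()
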
Example #16
0
class CsSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_host(self, url):
        host = url.split('/')[2]
        return host

    def get_news_header(self):
        '''
        The headers used for the news list differ from those used for news detail pages.
        :return: headers dict for detail-page requests
        '''
        return {
            # 'Host': '',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.cs.com.cn/',
        }


    def get_html(self, url):
        '''
        Fetch the start page and extract article links with a regex.
        :param url: start-page URL
        :return: list of absolute article URLs
        '''
        html = requests.get(url)
        html.encoding = 'gbk'


        # log(html.text)

        pattern = r"\./*[a-z]*/*[a-z]*/[a-z]+/\d+/t\d+_\d+.html"

        urls = re.findall(pattern, html.text)

        new_urls = []
        for ur in  urls:
            new_urls.append(self.parser_url(ur))
        # log(new_urls)

        return new_urls


    def parser_url(self, url):
        log(url)
        return 'http://www.cs.com.cn' + url[1:]

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been stored
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue

            news = self.get_newsinfo(url)

            if news == 'error' or news == 'timeout':
                continue

            news_list.append(news)
        return news_list


    def get_newsinfo(self, url):
        '''
        Fetch and parse a single news detail page.
        :param url: detail-page URL
        :return: News object, or 'error' / 'timeout' on failure
        '''
        t_sleep()
        log('当前访问的URL', url)


        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1

            return 'timeout'


        # log(html.text)

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'cs'

        return news

    def parse_item(self, response):

        try:
            title = response.xpath('//div[@class="artical_t"]/h1/text()')[0].strip()
        except Exception as e:
            title = '未知'
        try:
            date = response.xpath('//span[@class="Ff"]/text()')[0].split()[0]
        except Exception as e:
            date = response.xpath('//span[@class="ctime01"]/text()')[0].split()[0]


        try:
            con_list = response.xpath('//div[@class="artical_c"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = '未知'

        # contents = [re.sub(r'[a-z]+|\s+', '', cc) for cc in con_list]

        content = ''.join(con_list).strip()
        # log('content', content)
        return title, date, content

    def run(self):
        log_line('CsSpider 启动!!!')

        start_urls = [
            'http://www.cs.com.cn/',
        ]

        for url in start_urls:
            urls = self.get_html(url)
            news_list = self.send_request(urls)

            for news in news_list:
                self.mgr.insert(news)

        self.__class__().re_send()
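
The link pattern in CsSpider.get_html() is easier to read against a concrete path. A quick self-contained check; the sample href below is invented for illustration only.

import re

pattern = r"\./*[a-z]*/*[a-z]*/[a-z]+/\d+/t\d+_\d+\.html"
sample = '<a href="./xwzx/hg/201906/t20190601_1234567.html">demo</a>'

# findall returns the relative path, which parser_url() then turns into
# 'http://www.cs.com.cn/xwzx/hg/201906/t20190601_1234567.html'
print(re.findall(pattern, sample))
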
Example #17
0
class AmacSpider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.retry = -1
        self.retry_flag = -1
        self.failurls = []
        self.start_urls = [
            'http://www.amac.org.cn/flfg/flfgwb/',
        ]

    def get_news_header(self):
        '''
        The headers used for the news list differ from those used for news detail pages.
        :return: headers dict for detail-page requests
        '''
        return {
            'Host': 'www.amac.org.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        Fetch a list page and queue each entry into self.newslist via parser_item.
        :param url: list-page URL
        :return: None
        '''

        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//div[@class="newsTrTitle"]/a')

        # log_line(len(items))

        for item in items:
            self.parser_item(item)

    def parser_item(self, item):
        news = News()
        news.spider_name = 'amac'
        news.url = self.parser_url(
            item.xpath('./@href')[0], 'http://www.amac.org.cn')
        news.title = item.xpath('./text()')[0]

        self.newslist.append(news)

    def parser_url(self, url, base_url):
        return base_url + url.split('../..')[1]

    def get_newsUrls(self):
        return [news.url for news in self.newslist]

    def send_request(self, urls):

        for url in urls:
            # Skip URLs that have already been stored
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            date, content = self.parser_data(url)
            if content in ('error', 'timeout'):
                continue
            self.update_news(url, content, date)

    def parser_data(self, url):
        '''
        Fetch and parse a single news detail page.
        :param url: detail-page URL
        :return: (date, content), or ('timeout', 'timeout') / ('error', 'error') on failure
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            return 'timeout', 'timeout'

        if html.status_code != 200:
            return 'error', 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//div[@class="ldContent"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()

        date = response.xpath('//div[@class="ldDate"]/text()')[0]
        date = date.split(':')[1]
        # log('内容', content)
        return date, content

    def update_news(self, url, content, date):
        for news in self.newslist:
            if news.url == url:
                news.content = content
                news.date = date

    def run(self):
        log_line('AmacSpider 启动!!!')

        for url in self.start_urls:
            self.get_html(url)
            self.send_request(self.get_newsUrls())

            for news in self.newslist:
                find_one = self.mgr.find_one('url', news.url)
                if find_one is not None:
                    log_line('该URL已经存在 无需写入')
                    log(news.url)
                    continue
                self.mgr.insert(news)

        self.__class__().re_send()
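
Every example builds News objects carrying title, date, content and url plus a spider_name attribute, but the News class itself is not part of this listing. A minimal stand-in that would satisfy the way the spiders use it; the attribute names come from the calls above, everything else is assumed.

class NewsSketch:
    '''Illustrative container; the real News class may differ.'''

    def __init__(self, title=None, date=None, content=None, url=None):
        self.title = title
        self.date = date
        self.content = content
        self.url = url
        self.spider_name = None   # set by each spider after construction
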
Example #18
0
class Circ2Spider(BaseSpider):
    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.start_urls = [
            'http://www.gov.cn/pushinfo/v150203/base_14px_pubdate.htm',
        ]

    def get_news_header(self):
        return {
            # 'Host': 'www.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.circ.gov.cn/web/site0/tab7642/',
        }

    def get_html(self, url):
        '''
        Fetch the list page and populate self.newslist via parser_item.
        :param url: list-page URL
        :return: None on success, 'error' / 'timeout' on failure
        '''

        t_sleep()
        # log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            return 'timeout'

        if html.status_code != 200:
            return 'error'

        html = etree.HTML(html.text)
        items = html.xpath('//ul[@class="list"]/li')

        # log(len(items))

        for item in items:
            self.parser_item(item)

    def parser_item(self, item):
        url = item.xpath('./a/@href')[0]
        date = item.xpath('./span/text()')[0]

        news = News()
        news.spider_name = 'circ'
        news.url = self.parser_url(url, 'http://www.gov.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = date

        # log(news.url, news.title, news.date)

        self.newslist.append(news)

    def parser_url(self, url, base_url):
        if str(url).startswith('http'):
            return url
        else:
            return base_url + url

    def get_newsUrls(self):
        return [news.url for news in self.newslist]

    def run(self):

        log_line('Circ2Spider 启动!!!')

        for url in self.start_urls:
            self.get_html(url)

            for news in self.newslist:
                find_one = self.mgr.find_one('url', news.url)
                if find_one is not None:
                    log_line('该URL已经存在 无需写入')
                    log(news.url)
                    continue
                self.mgr.insert(news)

        self.__class__().re_send()
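
All of the request headers above rely on randomUserAgent(), and every detail fetch begins with t_sleep(); neither helper appears in this listing. A plausible minimal sketch follows; the user-agent strings and the sleep range are made up for illustration.

import random
import time

def randomUserAgent():
    # Placeholder pool; the real project presumably ships a longer list.
    agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    ]
    return random.choice(agents)

def t_sleep():
    # Small random pause between detail requests to avoid hammering the sites.
    time.sleep(random.uniform(1, 3))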