Example #1
    def get_newsinfo(self, url):
        '''
        Request each news detail page.
        :param url: news detail URL
        :return: News object, or 'timeout' / 'error' on failure
        '''
        t_sleep()
        log('Current URL being visited', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log_line('Response status code is not 200')
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'bjjrj'
        return news
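
These spider snippets rely on several helpers that are not shown on this page: the News model, t_sleep(), log()/log_line(), and get_news_header() (which is expected to return a dict of HTTP request headers). The sketch below is only a guess at minimal stand-ins; the signatures and field names are assumptions inferred from how the examples use them, not the project's actual code.

import random
import time


def t_sleep():
    # Assumed throttle helper: pause for a short random interval between requests.
    time.sleep(random.uniform(1, 3))


def log(*args):
    # Assumed logging helper: print all arguments on one line.
    print(*args)


def log_line(msg):
    # Assumed logging helper: print a message preceded by a separator line.
    print('-' * 40)
    print(msg)


class News:
    # Assumed model: a plain container for the fields the spiders set;
    # extra attributes (id, source_url, tag, ...) are assigned dynamically.
    def __init__(self, title=None, date=None, content=None, url=None):
        self.title = title
        self.date = date
        self.content = content
        self.url = url
        self.spider_name = None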
Example #2
    def get_iteminfo(self, url):
        '''
        Visit each news detail page.
        :param url: news detail URL
        :return: News model
        '''
        t_sleep()

        log('Current URL being visited', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            return 'timeout'

        if html.status_code != 200:
            log('Error visiting URL!!!', url)
            return 'error'

        response = etree.HTML(html.text)
        title, date, content = self.parse_item(response)

        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'xinhua'
        return news
Example #3
    def get_newsinfo(self, url, parser_item_fuc):
        '''
        Request each news detail page.
        '''
        t_sleep()

        log('Current URL being visited', url)

        html = self.get_html(url)
        if html == 'timeout':
            return 'error'

        response = etree.HTML(html.text)
        log('Current URL being visited', url, html.status_code)

        if html.status_code != 200:
            log('Error visiting URL!!!', url)
            return 'error'

        # parser_item_fuc(response)

        title, date, content = parser_item_fuc(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'pbc'
        return news
Example #4
    def parser_item(self, item):
        news = News()
        news.spider_name = 'amac'
        news.url = self.parser_url(
            item.xpath('./@href')[0], 'http://www.amac.org.cn')
        news.title = item.xpath('./text()')[0]

        self.newslist.append(news)
Example #5
    def get_newsinfo(self, url):
        '''
        Request each news detail page.
        :param url: news detail URL
        :return: News object, or 'timeout' on failure
        '''
        t_sleep()

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('Request failed')
            print(e)
            return 'timeout'

        response = etree.HTML(html.text)
        log('Current URL being visited', url)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        return news
Example #6
    def select_url(self):
        try:
            connect = get_connect()
            cursor = connect.cursor()
            print("connection")
            sql = "SELECT id,source_url FROM toutiao_news where id >(select min(t.id) from (select id from toutiao_news where content is null order by id) t) order by id desc "
            cursor.execute(sql)
            result = cursor.fetchall()
            for row in result:
                # print(row[0])
                news = News()
                news.id = row[0]
                news.source_url = row[1]
                self.news_list.append(news)
        except Exception as e:
            print(e)
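
Example #6 and Example #8 call get_connect() to obtain a database handle, but the helper itself is not shown. A minimal sketch is given below, assuming a MySQL database accessed through pymysql; the host, credentials and database name are placeholders, not the project's real configuration.

import pymysql


def get_connect():
    # Assumed connection helper: open a connection to the news database.
    return pymysql.connect(
        host='localhost',      # placeholder host
        user='root',           # placeholder user
        password='password',   # placeholder password
        db='news',             # placeholder database name
        charset='utf8mb4'
    )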
Example #7
    def parse_item(self, response, url):

        try:
            title = (response.xpath('//h2[@class="titl"]/text()'))[0].strip()
        except Exception as e:
            title = 'Unknown'

        try:
            date = (response.xpath('//p[@class="Wh"]/span[1]/text()')
                    )[0].strip().split()[0]
            date = str(arrow.get(date)).split('T')[0]
        except Exception as e:
            date = 'Unknown'

        try:
            con_list = response.xpath('//div[@class="detailCont"]/p')
            content = self.pasre_content(con_list)
        except Exception as e:
            content = 'Unknown'

        item = News()
        item.title = title
        item.date = date
        item.content = content
        item.url = url
        item.spider_name = 'jingji'

        return item
Example #8
def select_url():
    arrList = []
    try:
        connect = get_connect()
        cursor = connect.cursor()
        print("connection")
        sql = "SELECT id,source_url FROM toutiao_news WHERE id > 15855"
        cursor.execute(sql)
        result = cursor.fetchall()
        for row in result:
            # print(row[0])
            news = News()
            news.id = row[0]
            news.source_url = row[1]
            arrList.append(news)
    except Exception as e:
        print(e)
    finally:
        return arrList
def toutiao_news_api(url):
    # Proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"

    # Proxy tunnel authentication credentials
    proxyUser = "******"
    proxyPass = "******"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    ua = UserAgent(verify_ssl=False)
    headers = {
        'cookie':
        'tt_webid=6825236887406953998; s_v_web_id=verify_ka17kc91_J51hfIgB_1Ujy_4F87_AQ77_v44SCeaZdYbb; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=ftj73c94a1589124278466; tt_webid=6825236887406953998; csrftoken=3bc73a541ff3c196706a5fa652baa10a; ttcid=93c87bb6d2c44204a824c060f2a0344b39; SLARDAR_WEB_ID=167cd898-158d-4682-84b7-515f808f9c49; tt_scid=nvrgh8BUDb5bfXypX.EbNgFcMiVjrSr7vdwnPAab2w2tEn2I8DLcdmqRb2aAGGvT6b9b',
        'user-agent': ua.random,
        'x-requested-with': 'XMLHttpRequest'
    }
    toutiao_data = requests.get(url, headers=headers, proxies=proxies).text
    global data
    data = json.loads(toutiao_data)
    global max_behot_time
    max_behot_time = data['next']['max_behot_time']
    items = data['data']

    news_list = []
    link_head = 'http://toutiao.com'

    for n in items:
        if 'title' in n and n['tag'] != 'ad' and n['tag'] != 'news_media':
            news = News()
            news.title = n['title']
            print(news.title)
            news.tag = n['tag']
            news.source = n['source']
            # Convert to local time
            time_local = time.localtime(n['behot_time'])
            # Reformat as a time string (e.g. 2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
            news.news_date = dt
            print(news.news_date)
            news.source_url = link_head + n['source_url']

            news_list.append(news)
            #print(news.title, news.source_url, news.source, news.keyword, news.keywords)

    return news_list
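
toutiao_news_api() stores data['next']['max_behot_time'] in a global, which suggests the caller uses it to build the URL of the next feed page. A rough usage sketch follows; the feed URL template and the number of pages fetched are assumptions, not something the example confirms.

def crawl_toutiao_feed(pages=5):
    # Assumed feed URL template; max_behot_time=0 is taken to mean the first page.
    url_tpl = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&max_behot_time={}'
    behot_time = 0
    all_news = []
    for _ in range(pages):
        all_news.extend(toutiao_news_api(url_tpl.format(behot_time)))
        # toutiao_news_api() updates the global max_behot_time after each call.
        behot_time = max_behot_time
    return all_news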
Example #10
    def parser_item(self, item):
        news = News()
        news.spider_name = 'mohurd'
        news.url = item.xpath('./@href')[0]
        news.title = item.xpath('./text()')[0]
        news.date = item.getparent().getnext().xpath(
            './text()')[0][1:-1].replace('.', '-').strip()
        self.newslist.append(news)
Example #11
    def parser_item(self, item):
        url = item.xpath('./li[@class="mc"]/div/a/@href')[0]
        date = item.xpath('./li[@class="fbrq"]/text()')[0]

        news = News()
        news.spider_name = 'csrc'
        news.url = self.parser_url(url, 'http://www.csrc.gov.cn/pub/zjhpublic')
        news.title = item.xpath('./li[@class="mc"]/div/a/text()')[0]
        news.date = arrow.get(date).format('YYYY-MM-DD')

        # log(news.url, news.title, news.date)
        self.newslist.append(news)
Example #12
    def parser_item(self, item):
        url = item.xpath('./a/@href')[0]
        date = item.xpath('./span/text()')[0]

        news = News()
        news.spider_name = 'circ'
        news.url = self.parser_url(url, 'http://www.gov.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = date

        # log(news.url, news.title, news.date)

        self.newslist.append(news)
Example #13
    def parser_item(self, item):
        url = item.xpath('./a/@href')[0]
        if 'search' in url:
            return

        date = item.getnext().xpath('./text()')[0][1:-1]

        news = News()
        news.spider_name = 'circ'
        news.url = self.parser_url(url, 'http://www.circ.gov.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = date

        # log(news.url, news.title, news.date)

        self.newslist.append(news)
Example #14
    def get_html(self, url):
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//a[@class="STYLE8"]')

        for item in items:
            news = News()
            news.spider_name = 'cbrc'
            news.url = item.xpath('./@href')[0]
            news.title = item.xpath('./@title')[0]
            news.date = item.getparent().getnext().xpath('./text()')[0].strip()

            self.newslist.append(news)

        return self.parser_url(self.newslist)
def keyword_search(keyword):

    source_url_list = select_source_url_returnset()

    url = 'http://www.toutiao.com/search_content/?offset=0&format=json&keyword=' + keyword + '&autoload=true&count=200&cur_tab=1'

    toutiao_data = requests.get(url).text

    data = json.loads(toutiao_data)
    items = data['data']

    news_list = []
    link_head = 'http://toutiao.com'

    for n in items:
        if 'title' in n:
            news = News()
            news.title = n['title']
            news.tag = n['tag']
            news.source = n['source']
            news.source_url = link_head + n['source_url']
            # Search keyword (the "Two Sessions" topic)
            news.keyword = keyword
            # Keywords supplied by Toutiao itself
            news.keywords = n['keywords']

            # Skip the item if its source_url is already in the database
            if news.source_url in source_url_list:
                print('Record already exists in the database!')
                continue

            print('New record added:', news.title)
            news_list.append(news)
            # print(news.title, news.source_url, news.source, news.keyword, news.keywords)

    return news_list
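
One possible way to drive keyword_search() and persist its results, assuming the same toutiao_news table used by select_url() and the get_connect() helper sketched earlier; the column list in the INSERT statement is an assumption based on the fields set above.

def save_keyword_news(keyword):
    # Fetch search results for one keyword and insert the new rows.
    news_list = keyword_search(keyword)
    connect = get_connect()
    cursor = connect.cursor()
    sql = ("INSERT INTO toutiao_news (title, tag, source, source_url, keyword, keywords) "
           "VALUES (%s, %s, %s, %s, %s, %s)")  # assumed column names
    for news in news_list:
        cursor.execute(sql, (news.title, news.tag, news.source,
                             news.source_url, news.keyword, news.keywords))
    connect.commit()
    cursor.close()
    connect.close()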