Example #1
    def __init__(self, *args, **kwargs):
        # parameter used when talking to BDsearchUrlUtil; identifies the target site
        self.site = 'news.qq.com'

        # parse() needs the 'keyword' (a custom item attribute) tied to each url,
        # but the scrapy Response object does not carry it, and subclassing
        # Response is not worth it here, so it is kept as a spider attribute
        self.keyword = ''

        self.bd = BDsearchUrlUtil()
        self.slog = SpiderLogUtil()

        super().__init__(*args, **kwargs)
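The comment above notes that parse() needs the keyword tied to each url, but a scrapy Response carries no such field, so the spider keeps it as an attribute. A minimal sketch of the usual alternative, passing per-request data through Request.meta, is shown below; the spider name and start URL are made up for illustration only.

import scrapy


class KeywordMetaDemoSpider(scrapy.Spider):
    # hypothetical spider, only to illustrate Request.meta
    name = 'keyword_meta_demo'

    def start_requests(self):
        keyword = getattr(self, 'q', '中美贸易')
        # attach the keyword to the request instead of storing it on the spider
        yield scrapy.Request('https://news.qq.com/',
                             callback=self.parse,
                             meta={'keyword': keyword})

    def parse(self, response):
        # the keyword travels with the response of the request that carried it
        yield {'url': response.url, 'keyword': response.meta['keyword']}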
Example #2
    def __init__(self, *args, **kwargs):

        self.slog = SpiderLogUtil()

        super().__init__(*args, **kwargs)
Example #3
class WeiboSpider(Spider):
    name = "weibo_spider"
    hotbase_url = "https://weibo.cn/search/mblog?" \
               "hideSearchFrame=&keyword=#" \
               "&advancedfilter=1&sort=hot&page="
    base_url = "https://weibo.cn/search/mblog?" \
               "keyword=#" \
               "&sort=time&page="
    custom_settings = {
        # replace the Cookie below with your own Cookie
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': False,
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Cookie':'_T_WM=f7fc1975334e2610dd77c4a949caaa2e; __guid=78840338.2690867225963806000.1561098303'
                     '394.3926; TMPTOKEN=xWotEi1ho4BsQadI1WKh50PW3wxeD0MXriFaU01sHfs7ddDfYkc6g8QC0brRQ2iI; SUB=_2A25'
                     'wCAggDeRhGeBM6lER8yzJzD2IHXVT8qhorDV6PUJbkdAKLUnFkW1NRRuUuWjxLqOlReVo19AJKlLVFf0K-Qcb; SUHB=0h1J3'
                     'h4MnoA8ye; SCF=AqNspA5hvpJAB-QOIpSEFOvS7uTz2C-xcjU2d4im-izONxHBJbLovO6aDcPk7st0qIDcNhWWOxTPgrhwE'
                     'NoLpoA.; SSOLoginState=1561098352; monitor_count=2'
        }
    }


    def __init__(self, *args, **kwargs):

        self.slog = SpiderLogUtil()

        super().__init__(*args, **kwargs)

    def close(self, reason):

        self.slog.spider_finish(self)
        super().close(self, reason)

    def start_requests(self):

        querystr = getattr(self, 'q', '中美贸易')
        self.querystr = querystr
        self.times = getattr(self, 't', 3)

        self.slog.spider_start(self)

        # # set the folder where weibo images are saved
        # folderpath = "e:\weibo" + querystr
        # if (not os.path.exists(folderpath)):
        #     os.mkdir(folderpath)
        # folderpath = "e:\weibo"
        # if (not os.path.exists(folderpath)):
        #     os.mkdir(folderpath)


        self.q = []
        self.base_url = self.base_url.replace("#", querystr)
        self.hotbase_url = self.hotbase_url.replace("#", querystr)
        print("Start crawling Weibo, keyword: " + self.querystr + ", round " + str(self.times))
        yield Request(url=self.hotbase_url + "1", callback=self.parse_tweet)
        yield Request(url=self.base_url + "1", callback=self.parse_tweet)



    # def parse_url(self,response):
    #     if response.url.endswith('page=1'):
    #         # if this is page 1, queue all the remaining pages at once
    #         all_page = re.search(r' 1/(\d+)页', response.text)
    #         if all_page:
    #             all_page = all_page.group(1)
    #             all_page = int(all_page)
    #             print('got page count', all_page)
    #             if all_page>=99:
    #                 all_page=99
    #             for page_num in range(2,3):
    #                 page_url = response.url.replace(
    #                     'page=1', 'page={}'.format(page_num))
    #                 yield Request(url=page_url, callback=self.parse_url,
    #                               dont_filter=True, meta=response.meta)
    #     """
    #     Parse the data on this page
    #     """
    #     tree_node = etree.HTML(response.body)
    #     tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    #     for tweet_node in tweet_nodes:
    #             tweet_repost_url = tweet_node.xpath(
    #                 './/a[contains(text(),"转发[")]/@href')[0]
    #             user_tweet_id = re.search(
    #                 r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
    #             weibo_url = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),user_tweet_id.group(1))
    #             yield Request(url=weibo_url,callback= self.parse_details,
    #                           dont_filter=True, meta=response.meta,args={'wait': 2})

    def parse_details(self,response):
        body = response.body
        body = body.decode("utf-8")
        # print(body)
        selector = Selector(text=body)
        headandname = selector.xpath('//div[@class="face"]/a[@class="W_face_radius"]')[0]
        head=headandname.xpath("./img/@src").get()
        author_name=headandname.xpath("./@title").get()
        author_url=headandname.xpath("./@href").get()

        timeandfrom=selector.xpath('//div[@class="WB_from S_txt2"]')[0]
        posttime=timeandfrom.xpath("./a")[0].xpath("./@title").get()
        timeArray = time.strptime(posttime, "%Y-%m-%d %H:%M")
        created_at =time.strftime("%Y-%m-%d-%H-%M-%S", timeArray)
        crawl_time= str(int(time.time()))
        tool=""
        if len(timeandfrom.xpath("./a"))>1:
            tool=timeandfrom.xpath("./a")[1].xpath("./text()").get()

        content1 = selector.xpath('//div[@class="WB_text W_f14"]')
        content = content1.get()
        p = re.sub(r'<.*?>', '', content)
        content = re.sub(r'  ', '', p).strip()
        location1 = content1.xpath('./a/i[@class="W_ficon ficon_cd_place"]')
        if location1:
            location = location1.xpath('../@title').get()
            content = content.rstrip(location)
            print(content, location)

    def parse_information(self, response):
        """ 抓取个人信息 """
        body = response.body
        body = body.decode("utf-8")
        selector = Selector(text=body)
        head_url = selector.xpath('//img[@alt="头像"]//@src').get()
        information_item = InformationItem()
        information_item["head"] =head_url
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = response.meta['id']

        # collect all text() inside the tag
        text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        briefIntroduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        labels = re.findall('标签;?[::]?(.*?)更多>>', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if briefIntroduction and briefIntroduction[0]:
            information_item["brief_introduction"] = \
                briefIntroduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        if sex_orientation and sex_orientation[0]:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"
            else:
                information_item["sex_orientation"] = "异性恋"
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(u"\xa0", "")
        if labels and labels[0]:
            information_item["labels"] = \
                labels[0].replace(u"\xa0", ",").replace(';', '').strip(',')
        #yield information_item
        request_meta = response.meta
        request_meta['item'] = information_item
        yield Request("https://weibo.cn/u/"+information_item['_id'],
                      callback=self.parse_further_information,
                      meta=request_meta, dont_filter=True, priority=1)

    def parse_further_information(self, response):
        text = response.text
        # print(text)
        information_item = response.meta['item']
        tweets_num = re.findall('微博\[(\d+)\]', text)
        if tweets_num:
            information_item['tweets_num'] = int(tweets_num[0])
        follows_num = re.findall('关注\[(\d+)\]', text)
        if follows_num:
            information_item['follows_num'] = int(follows_num[0])
        fans_num = re.findall('粉丝\[(\d+)\]', text)
        if fans_num:
            information_item['fans_num'] = int(fans_num[0])
        yield information_item
        # # 获取该用户微博
        # yield Request(url=self.base_url +
        #                   '/{}/profile?page=1'.format(information_item['_id']),
        #               callback=self.parse_tweet,
        #               priority=1)
        #
        # # 获取关注列表
        # yield Request(url=self.base_url +
        #                   '/{}/follow?page=1'.format(information_item['_id']),
        #               callback=self.parse_follow,
        #               dont_filter=True)
        # # 获取粉丝列表
        # yield Request(url=self.base_url +
        #                   '/{}/fans?page=1'.format(information_item['_id']),
        #               callback=self.parse_fans,
        #               dont_filter=True)


    def parse_tweet(self, response):
        # body = response.body
        # body = body.decode("utf-8")
        # print(body)
        if response.url.endswith('page=1'):
            # if this is page 1, queue all the remaining pages at once
            all_page = re.search(r'&nbsp;1/(\d+)页', response.text)
            if all_page:

                all_page = all_page.group(1)
                all_page = int(all_page)
                print('got page count', all_page)
                if all_page >= 99:
                    all_page = 99

                for page_num in range(2, 30):
                    page_url = response.url.replace(
                        'page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet,
                                  dont_filter=True, meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['keyword'] = self.querystr
                tweet_item['crawl_time'] = []
                tweet_item['crawl_time'].append(str(int(time.time())))
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(
                    r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = \
                    'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                    user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))


                create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = \
                        time_fix(create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split('来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(create_time_info.strip())



                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = []
                tweet_item['like_num'].append( int(re.search('\d+', like_num).group()))
                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = []
                tweet_item['repost_num'].append(int(re.search('\d+', repost_num).group()))
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") '
                    'and not(contains(text(),"原文"))]/text()')[-1]
                tweet_item['comment_num'] = []
                tweet_item['comment_num'].append(int(re.search('\d+', comment_num).group()))
                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]

                videos = tweet_node.xpath('.//a[contains(@href,'
                                          '"https://m.weibo.cn/s/video/show?object_id=")]'
                                          '/@href')
                if videos:
                    tweet_item['video_url'] = videos[0]

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                    tweet_item['location'] = \
                        map_node.xpath('./preceding-sibling::a/text()')[0]

                repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                # check whether there is a "Read full text" link:
                # all_content_link =
                #   tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
                # if all_content_link:
                #     all_content_url =
                #       self.base_url + all_content_link[0].xpath('./@href')[0]
                #     yield Request(all_content_url,
                #                   callback=self.parse_all_content,
                #                   meta={'item': tweet_item},
                #                   priority=1)
                #
                # else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                if tweet_item['_id'] not in self.q:
                    self.q.append(tweet_item['_id'])
                    yield tweet_item

                if self.times == 0:
                    # crawl the comments of this weibo
                    comment_url = 'https://weibo.cn/comment/hot/' \
                                  + tweet_item['weibo_url'].split('/')[-1] + '?rl=2'
                    # print(comment_url)
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)

    def parse_all_content(self, response):
        # the post has a "Read full text" link; fetch the full text
        body = response.body

        body = body.decode("utf-8", "ignore")
        # print(body)

        response.replace(body=body)
        tree_node = etree.HTML(response.body)
        tweet_item = response.meta['item']
        content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
        tweet_html = etree.tostring(content_node, encoding='unicode')
        tweet_item['content'] = extract_weibo_content(tweet_html)
        self.q.append(tweet_item['_id'])
        yield tweet_item

    def parse_follow(self, response):
        """
        抓取关注列表
        """
        # 如果是第1页,一次性获取后面的所有页
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_follow,
                                  dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" '
                              'or text()="取消关注"]/@href').extract()
        uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall('(\d+)/follow', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = int(time.time())
            relationships_item["fan_id"] = ID
            relationships_item["followed_id"] = uid
            relationships_item["_id"] = ID + '-' + uid
            yield relationships_item

    def parse_fans(self, response):
        """
        抓取粉丝列表
        """
        # 如果是第1页,一次性获取后面的所有页
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_fans,
                                  dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" '
                              'or text()="移除"]/@href').extract()
        uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall('(\d+)/fans', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = int(time.time())
            relationships_item["fan_id"] = uid
            relationships_item["followed_id"] = ID
            relationships_item["_id"] = uid + '-' + ID
            yield relationships_item

    def parse_comment(self, response):
        # if this is page 1, queue all the remaining pages at once
        # if response.url.endswith('page=1'):
        #     all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
        #     if all_page:
        #         all_page = all_page.group(1)
        #         all_page = int(all_page)
        #         for page_num in range(2, all_page + 1):
        #             page_url = response.url.replace('page=1', 'page={}'.format(page_num))
        #             yield Request(page_url, self.parse_comment,
        #                           dont_filter=True, meta=response.meta)
        body = response.body

        body = body.decode("utf-8")
        # print(body)
        response.replace(body=body)
        # print(response.body)
        tree_node = etree.HTML(response.body)
        comment_nodes = tree_node.xpath('//div[@class="c" and contains(@id,"C_")]')
        for comment_node in comment_nodes:
            comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href')
            if not comment_user_url:
                continue
            comment_item = CommentItem()
            comment_item['crawl_time'] = int(time.time())
            comment_item['weibo_url'] = response.meta['weibo_url']
            comment_item['comment_user_id'] = \
                re.search(r'/u/(\d+)', comment_user_url[0]).group(1)
            comment_item['content'] = \
                extract_comment_content(etree.tostring(comment_node, encoding='unicode'))
            comment_item['_id'] = comment_node.xpath('./@id')[0]
            created_at_info = comment_node.xpath('.//span[@class="ct"]/text()')[0]
            like_num = comment_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            comment_item['like_num'] = int(re.search('\d+', like_num).group())
            comment_item['created_at'] = time_fix(created_at_info.split('\xa0')[0])
            people_url = 'https://weibo.cn/' + comment_item['comment_user_id'] + '/info'
            yield comment_item
            yield Request(people_url, self.parse_information,
                          meta={"id": comment_item['comment_user_id']})

    def parse_head(self, response):
        body = response.body

        body = body.decode("utf-8")
        selector = Selector(text=body)
        head_url = selector.xpath('//img[@alt="头像"]//@src').get()
        item = response.meta
        item['head_url'] = head_url
        print(type(item), item)
        yield item
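As a side note, the pagination trick used in parse_tweet above can be seen in isolation in the sketch below: the total page count is read from the '1/N页' marker on the first result page, and the remaining page URLs are produced by rewriting the page parameter. The sample_html fragment and page1_url are made up for illustration.

import re

# made-up fragment resembling weibo.cn's pager markup
sample_html = '<div id="pagelist">&nbsp;1/42页</div>'
page1_url = 'https://weibo.cn/search/mblog?keyword=中美贸易&sort=time&page=1'

match = re.search(r'&nbsp;1/(\d+)页', sample_html)
if match:
    total_pages = min(int(match.group(1)), 99)   # the spider above caps this at 99
    follow_up_urls = [page1_url.replace('page=1', 'page={}'.format(n))
                      for n in range(2, total_pages + 1)]
    print(total_pages, follow_up_urls[0], follow_up_urls[-1])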
Example #4
class QQNewsSpider(scrapy.Spider):
    name = 'qqnews_spider'

    def __init__(self, *args, **kwargs):
        # parameter used when talking to BDsearchUrlUtil; identifies the target site
        self.site = 'news.qq.com'

        # parse() needs the 'keyword' (a custom item attribute) tied to each url,
        # but the scrapy Response object does not carry it, and subclassing
        # Response is not worth it here, so it is kept as a spider attribute
        self.keyword = ''

        self.bd = BDsearchUrlUtil()
        self.slog = SpiderLogUtil()

        super().__init__(*args, **kwargs)

    def close(self, reason):
        # when the spider stops, call clockoff() to update the database
        self.slog.spider_finish(self)
        if self.bd.clockoff(self.site, self.keyword):
            self.logger.info('qqnews_spider clock off successful')

        super().close(self, reason)

    def start_requests(self):
        # get params (from the console command) when started
        self.keyword = getattr(self, 'q', None)

        if self.keyword is None:
            self.keyword = '中美贸易'

        self.slog.spider_start(self)

        # get url list for mongoDB
        urllist = self.bd.getNewUrl(self.site, self.keyword)

        # if no new url or error, urllist=None
        if urllist:
            for url in urllist:
                yield scrapy.Request(url, self.parse)

        # # test news_qq spider
        # url = 'https://news.qq.com/a/20170823/002257.htm'
        # yield scrapy.Request(url, self.parse)

    def parse(self, response):
        item = QQNewsItem()

        # common to both page layouts
        item['url'] = response.url
        item['crawl_time'] = getCurrentTime()
        item['title'] = response.xpath('//div[@class=\'hd\']/h1/text()').get()
        item['keyword'] = self.keyword

        # extract the article body
        content = ''
        for paragraph in response.xpath(
                '//div[@id=\'Cnt-Main-Article-QQ\']/p/text()'):
            paragraph = paragraph.get().strip()
            paragraph = re.sub(r'<[^i].*?>', '', paragraph)
            paragraph = re.sub(r'\(function[\s\S]+?\}\)\(\);', '', paragraph)
            content = content + paragraph
        item['content'] = content

        # if there is body text it is a news article, otherwise it is not
        # the publish-time / publish-source layout is maddening: it changed many
        # times in just ten years, so each variant has to be handled one by one
        if content:
            # publish time
            item['time'] = self.trygetPublishTime(response)
            # publish source
            item['source'] = self.trygetPublishSource(response)

            yield item
        else:
            pass

    @staticmethod
    def trygetPublishTime(response):
        time = response.xpath('//span[@class=\'a_time\']/text()').get()
        if not time:
            time = response.xpath(
                '//div[@class=\'hd\']/div[@bosszone=\'titleDown\']'
                '//span[@class=\'article-time\']/text()').get()
        if not time:
            time = response.xpath('//div[@class=\'info\']/text()').get()
        if not time:
            time = response.xpath('//span[@class=\'pubTime\']/text()').get()

        # if a time was found, normalise it
        # three original formats:
        # 2011年07月12日10:33
        # 2011年07月12日 10:33
        # 2017-08-23 06:30
        # normalised to:
        # 2017-08-23-06-30-00
        if time:
            timefmt = formatTimeStr(time)
            if timefmt:
                return timefmt
            else:
                return time
        else:
            return None

    @staticmethod
    def trygetPublishSource(response):
        source = response.xpath('//span[@class=\'a_source\']/a/text()').get()

        if not source:
            source = response.xpath('//span[@class=\'a_source\']/text()').get()

        if not source:
            source = response.xpath('//span[@class=\'where\']/text()').get()

        if not source:
            source = response.xpath('//span[@class=\'where\']/a/text()').get()

        if not source:
            source = response.xpath(
                '//span[@class=\'color-a-1\']/a/text()').get()

        if not source:
            source = response.xpath(
                '//span[@class=\'color-a-1\']/text()').get()

        return source
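The formatTimeStr() helper used by trygetPublishTime above is not part of this example; the comments only describe what it is expected to do. A rough standalone sketch of that normalisation, written with the standard datetime module, might look like this (the normalize_publish_time name is made up):

from datetime import datetime


def normalize_publish_time(raw):
    # try the three layouts named in the comments above, oldest first
    for fmt in ('%Y年%m月%d日%H:%M', '%Y年%m月%d日 %H:%M', '%Y-%m-%d %H:%M'):
        try:
            return datetime.strptime(raw.strip(), fmt).strftime('%Y-%m-%d-%H-%M-%S')
        except ValueError:
            continue
    return None


print(normalize_publish_time('2011年07月12日10:33'))   # 2011-07-12-10-33-00
print(normalize_publish_time('2017-08-23 06:30'))      # 2017-08-23-06-30-00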
class China(Spider):
    name = "chinanews_spider"
    base_url = "http://sou.chinanews.com/search.do"
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        'CONCURRENT_REQUESTS': 15,
        'DOWNLOAD_DELAY': 2
    }

    def __init__(self, *args, **kwargs):

        self.slog = SpiderLogUtil()

        super().__init__(*args, **kwargs)

    def close(self, reason):

        self.slog.spider_finish(self)
        super().close(self, reason)

    def start_requests(self):

        querystr = getattr(self, 'q', None)
        if not querystr:
            querystr = '中美贸易'
        self.querystr = querystr
        self.q = []
        # folderpath ='E:\chinanews' + querystr
        # if (not os.path.exists(folderpath)):
        #     os.mkdir(folderpath)

        self.slog.spider_start(self)

        my_data = {
            'q': querystr,
            'ps': '10',
            'start': '0',
            'type': '',
            'sort': 'pubtime',
            'time_scope': str(0),
            'channel': 'all',
            'adv': str(1),
            'day1': '',
            'day2': '',
            'field': '',
            'creator': ''
        }
        yield FormRequest(
            formdata=my_data,
            url=self.base_url,
            callback=self.parsefornum,
        )

    def parsefornum(self, response):
        body = response.body
        body = body.decode("utf-8")
        response.replace(body=body)
        pre = re.compile(r'ongetkey\((\d+)\).?>尾页')
        num = pre.findall(str(body))
        num = int(num[0])
        print(num)
        if num > 400:
            num = 400
        for i in range(num):
            q = i * 10
            my_data = {
                'q': self.querystr,
                'ps': '10',
                'start': str(q),
                'type': '',
                'sort': 'pubtime',
                'time_scope': str(0),
                'channel': 'all',
                'adv': str(1),
                'day1': '',
                'day2': '',
                'field': '',
                'creator': ''
            }
            yield FormRequest(
                formdata=my_data,
                url=self.base_url,
                callback=self.parse,
            )

    def parse(self, response):
        body = response.body
        body = body.decode("utf-8")
        response.replace(body=body)
        for div in response.xpath('//td/ul/li[@class="news_title"]/a/@href'):
            url = div.extract()
            yield Request(url=url, callback=self.parse2)

    def parse2(self, response):
        body = response.body

        body = body.decode("utf-8", "ignore")

        response.replace(body=body)
        item = ChinaNewsItem()

        title = response.xpath('//div[@class="content"]/h1/text()').get()
        if title:
            title = title.strip()

            imgs = []
            content = ''
            ire = re.compile(r'src=\"(.+?)\"')
            pre = re.compile(r'<img[\s\S]+?>')
            url = response.url
            for p in response.xpath('//div[@class="left_zw"]').extract():
                p = re.sub(r'<[^i].*?>', '', p)
                p = re.sub(r'\(function[\s\S]+?\}\)\(\);', '', p)
                q = pre.findall(p)
                for i in q:
                    imgs.append(ire.findall(i)[0])
                    p = p.replace(
                        i, '&&此处有图片,url:' + imgs[-1] + ",存储名为:" +
                        (url.split('/')[-1]) + imgs[-1].split('/')[-1] + '&&')
                content = content + p.strip()
            timeandsource = response.xpath(
                '//div[@class="left-t"]/text()').get().strip()
            ts = timeandsource.split('来源')

            item['crawl_time'] = str(int(time.time()))
            created_time = ts[0].strip()
            timeArray = time.strptime(created_time, "%Y年%m月%d日 %H:%M")
            otherStyleTime = time.strftime("%Y-%m-%d-%H-%M-%S", timeArray)
            item['source'] = '中国新闻'
            if len(ts) > 1:
                source = ts[1]
                item['source'] = source

            item['keyword'] = self.querystr
            item['title'] = title
            item['content'] = content.replace("\r", "").replace("\n", "")
            item['time'] = otherStyleTime
            item['url'] = url
            item['imgs'] = imgs
            yield item


# if __name__ == "__main__":
#     process = CrawlerProcess(get_project_settings())
#     process.crawl('chinanews_spider')
#     process.start()
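For reference, the paging scheme used by the chinanews spider above boils down to the small sketch below: the last-page number is read from the 'ongetkey(N) ... 尾页' link on the first result page, capped at 400, and every follow-up POST differs only in its 'start' offset (10 results per page). The sample_body string is invented for illustration.

import re

# invented fragment resembling the pager link on sou.chinanews.com
sample_body = '<a href="javascript:void(0)" onclick="ongetkey(37)">尾页</a>'

last_page = int(re.compile(r'ongetkey\((\d+)\).?>尾页').findall(sample_body)[0])
last_page = min(last_page, 400)

# value that goes into the 'start' form field of each follow-up request
start_offsets = [i * 10 for i in range(last_page)]
print(last_page, start_offsets[:5])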
Example #6
class BaiduSearchSpider(scrapy.Spider):
    name = 'baidu_search_spider'

    def __init__(self, *args, **kwargs):

        self.slog = SpiderLogUtil()

        super().__init__(*args, **kwargs)

    def close(self, reason):

        self.slog.spider_finish(self)
        super().close(self, reason)

    def start_requests(self):
        # get params (from the console command) when started
        querystr = getattr(self, 'q', None)
        site = getattr(self, 'site', None)

        if querystr is None:
            querystr = '中美贸易'
        if site is None:
            site = 'news.qq.com'

        self.slog.spider_start(self)

        url = self.baidusearchurlGen(querystr, site, 0)
        yield scrapy.Request(url, self.parse)

    def parse(self, response):

        # if no result, quit spider
        if response.xpath('//div[@class=\'content_none\']'):
            return

        # ===get info from every result===
        for oneresult in response.xpath('//div[@class=\'result c-container \']\
                                    /h3/a/@href'):
            item = BaiduSearchItem()
            item['url'] = oneresult.get()
            item['crawl_time'] = getCurrentTime()
            item['site'] = self.getOrigSiteUrl(response.url)
            item['waste'] = False
            item['keyword'] = self.getOrigKeyword(response.url)
            yield item

        # ===crawl next page, if exist===

        # pn = page number
        currentpn = response.xpath('//div[@id=\'page\']/\
                        strong/span[@class=\'pc\']/text()')
        if currentpn:
            currentpn = int(currentpn[0].get())

        maxpn = response.xpath('//div[@id=\'page\']/a\
                        /span[@class=\'pc\']/text()')
        if maxpn:
            maxpn = int(maxpn[-1].get())

        nextpn = None

        # if so, exist one page num bigger than current page num
        if (currentpn and maxpn and (maxpn > currentpn)):
            nextpn = currentpn + 1

        if nextpn:
            # get '...&pn=' sub string
            pncharindex = re.search('&pn=', response.url).span()[1]
            nexturl = response.url[:pncharindex] + str((nextpn - 1) * 10)
            yield response.follow(nexturl, self.parse)

    '''
    Builds a Baidu search url, used as a seed url for this spider.
        e.g. http://www.baidu.com/s?wd="中美贸易" site%3Anews.qq.com&pn=0
            original query: ("中美贸易" site:news.qq.com)
            pn: page number, the rank of the first result on this page.
                pn = (result page number - 1) * rn
                The rn parameter (results per page) no longer behaves reliably,
                so the default of ten results per page is assumed.

    @ param {string} querystr   query string    e.g. 中美贸易

    @ param {string} site       e.g. news.qq.com
                                uses Baidu's site: operator to search within one site

    @ param {string} pagenumber rank of the first result on this page, = pn

    @ return {string}           e.g. https://www.baidu.com/s?
                                wd="中美贸易" site%3Anews.qq.com&pn=0
    '''

    @staticmethod
    def baidusearchurlGen(querystr, site, pagenumber):
        # note: https has an anti-crawler mechanism; the real data is loaded by
        # script, so only an empty shell can be crawled
        return "http://www.baidu.com/s?wd=\"" \
               + querystr + "\" site:" + site + "&pn=" + str(pagenumber)

    '''
    Extracts the site domain used in the search, e.g. news.qq.com

    @ param {string} resurl response.url
                            e.g. https://www.baidu.com/s?
                                wd="中美贸易" site%3Anews.qq.com&pn=0

    @ return {string}
    '''

    @staticmethod
    def getOrigSiteUrl(resurl):
        # 'site%3A{site domain}&pn'
        index = re.search('site:.*&pn', resurl).span()
        # '{site domain}'
        return resurl[(index[0] + 5):(index[1] - 3)]

    '''
    Extracts the search keyword, e.g. '中美贸易'

    @ param {string} resurl response.url
                            e.g. https://www.baidu.com/s?
                                wd=中美贸易 site%3Anews.qq.com&pn=0

    @ return {string}
    '''

    @staticmethod
    def getOrigKeyword(resurl):
        # 's?wd="{keyword}" site'
        index = re.search('wd=%22.*%22%20', resurl).span()
        # the keyword is percent-encoded in the url; decode it
        return urllib.parse.unquote(resurl[(index[0] + 6):(index[1] - 6)])
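A quick round trip through the three static helpers above, assuming BaiduSearchSpider is importable from this module; the encoded resurl is a hand-built example of how the seed url from the docstring may appear in response.url.

# seed url for the query shown in the docstrings above
seed = BaiduSearchSpider.baidusearchurlGen('中美贸易', 'news.qq.com', 0)
print(seed)   # http://www.baidu.com/s?wd="中美贸易" site:news.qq.com&pn=0

# hand-encoded version of such a url, as it may appear in response.url
resurl = ('http://www.baidu.com/s?wd=%22%E4%B8%AD%E7%BE%8E%E8%B4%B8%E6%98%93%22%20'
          'site:news.qq.com&pn=0')
print(BaiduSearchSpider.getOrigSiteUrl(resurl))   # news.qq.com
print(BaiduSearchSpider.getOrigKeyword(resurl))   # 中美贸易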
class SinaNewsSpider(scrapy.Spider):
    name = 'sinanews_spider'

    def __init__(self, *args, **kwargs):
        # parameter used when talking to BDsearchUrlUtil; identifies the target site
        self.site = 'news.sina.com.cn'

        # parse() needs the 'keyword' (a custom item attribute) tied to each url,
        # but the scrapy Response object does not carry it, and subclassing
        # Response is not worth it here, so it is kept as a spider attribute
        self.keyword = ''

        self.bd = BDsearchUrlUtil()

        self.slog = SpiderLogUtil()

        super().__init__(*args, **kwargs)

    def close(self, reason):
        self.slog.spider_finish(self)

        # when the spider stops, call clockoff() to update the database
        if self.bd.clockoff(self.site, self.keyword):
            self.logger.info('sinanews_spider clock off successful')

        super().close(self, reason)

    def start_requests(self):
        # get params (from the console command) when started
        self.keyword = getattr(self, 'q', None)

        if self.keyword is None:
            self.keyword = '中美贸易'

        self.slog.spider_start(self)

        # get url list for mongoDB
        urllist = self.bd.getNewUrl(self.site, self.keyword)

        # if no new url or error, urllist=None
        if urllist:
            for url in urllist:
                yield scrapy.Request(url, self.parse)

        # # test spider
        # url = 'http://news.sina.com.cn/c/2019-06-24' \
        #       '/doc-ihytcitk7355640.shtml'
        # yield scrapy.Request(url, self.parse)

    def parse(self, response):
        item = SinaNewsItem()

        item['url'] = response.url
        item['crawl_time'] = getCurrentTime()
        item['keyword'] = self.keyword

        title = response.xpath('//title/text()').get()
        if title:
            title = title.replace('_新浪新闻', '')
            title = title.replace('_新浪网', '')
            title = title.replace('_新浪军事', '')
            title = title.replace('_新闻中心', '')
        item['title'] = title

        item['time'] = self.trygetPublishTime(response)

        item['source'] = self.trygetPublishSource(response)

        # extract the article body

        item['content'] = self.trygetContent(response)

        yield item

    @staticmethod
    def trygetPublishTime(response):
        time = response.xpath('//div[@class=\'date-source\']'
                              '/span[@class=\'date\']/text()').get()

        # http://news.sina.com.cn/o/2017-07-07/doc-ifyhvyie0474852.shtml
        # http://mil.news.sina.com.cn/china/2016-04-14/
        # doc-ifxriqqx2384948.shtml
        if not time:
            if response.xpath('//span[@class=\'time-source\']'
                              '//span[@class=\'titer\']'):
                time = response.xpath('//span[@class=\'time-source\']'
                                      '//span[@class=\'titer\']/text()').get()
            else:
                time = response.xpath(
                    '//span[@class=\'time-source\']/text()').get()
                if time:
                    time = re.sub(r'<[^i].*?>', '', time)

        if time:
            timefmt = formatTimeStr(time)
            if timefmt:
                return timefmt
            else:
                return time
        else:
            return None

    @staticmethod
    def trygetPublishSource(response):
        source = response.xpath('//div[@class=\'date-source\']'
                                '/a[@class=\'source\']/text()').get()
        # http://news.sina.com.cn/o/2017-07-07/doc-ifyhvyie0474852.shtml
        if not source:
            source = response.xpath('//div[@class=\'time-source\']'
                                    '//a/text()').get()

        # http://mil.news.sina.com.cn/china/
        # 2016-04-14/doc-ifxriqqx2384948.shtml
        if not source:
            source = response.xpath('//span[@class=\'time-source\']'
                                    '//span[@class=\'source\']'
                                    '/text()').get()

        return source

    @staticmethod
    def trygetContent(response):
        content = ''

        def paragraph_process(paragraph):
            p = paragraph.get().strip()
            p = re.sub(r'<[^i].*?>', '', p)
            p = re.sub(r'\(function[\s\S]+?\}\)\(\);', '', p)
            return p

        # /a/ /c/ doc-... /o/
        if response.xpath('//div[@id=\'article\']//p/text()'):

            for paragraph in response.xpath(
                    '//div[@id=\'article\']//p/text()'):
                content = content + paragraph_process(paragraph)

        # some of /o/
        # http://news.sina.com.cn/o/2019-05-14/doc-ihvhiews1782968.shtml
        elif response.xpath('//div[@id=\'article\']//div/text()'):

            for paragraph in response.xpath(
                    '//div[@id=\'article\']//div/text()'):
                content = content + paragraph_process(paragraph)

        # http://news.sina.com.cn/o/2017-07-07/doc-ifyhvyie0474852.shtml
        elif response.xpath('//div[@id=\'artibody\']//p/text()'):
            for paragraph in response.xpath(
                    '//div[@id=\'artibody\']//p/text()'):
                content = content + paragraph_process(paragraph)

        return content
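Finally, to make the two clean-up regexes inside paragraph_process easier to follow, here is a small standalone check on an invented paragraph string: the first substitution strips HTML tags (except those whose name starts with 'i'), and the second strips inline '(function(){...})();' script residue.

import re

raw = '  Body text <a href="#">link</a>(function(){var x=1;})();  '
p = raw.strip()
p = re.sub(r'<[^i].*?>', '', p)                      # drop most html tags
p = re.sub(r'\(function[\s\S]+?\}\)\(\);', '', p)    # drop inline script residue
print(repr(p))   # 'Body text link'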