Ejemplo n.º 1
0
 def get_content(self, response):
     """Collect titles, links and descriptions for one results page.

     Each xpath yields all 10 entries of the page at once; title and dec
     keep their HTML tags.
     """
     item = WeixinItem()
     # All page titles (HTML included)
     item['title'] = response.xpath('//div[@class="txt-box"]/h3/a').extract()
     # All title links
     item['link'] = response.xpath('//div[@class="txt-box"]/h3/a/@href').extract()
     # All descriptions (HTML included)
     item['dec'] = response.xpath('//p[@class="txt-info"]').extract()
     yield item
Ejemplo n.º 2
0
    def get_content(self, response):
        """Yield one WeixinItem (title, link, dec, author) per result entry.

        Fixes vs. original: text fragments are joined with ``''.join`` instead
        of repeated ``+`` concatenation; ``extract_first()`` replaces
        ``[0].extract()`` so a missing node yields ``None`` rather than
        raising IndexError; debug prints removed.
        """
        for entry in response.xpath('//div[@class="news-box"]//li'):
            item = WeixinItem()

            # Title text is split across child nodes (keyword highlighting);
            # join the pieces and drop spaces/newlines.
            title = ''.join(
                entry.xpath('.//div[@class="txt-box"]/h3/a//text()').extract())
            item['title'] = title.replace(' ', '').replace('\n', '')

            item['link'] = entry.xpath(
                './/div[@class="txt-box"]/h3/a/@href').extract_first()

            info = ''.join(
                entry.xpath('.//p[@class="txt-info"]//text()').extract())
            item['dec'] = info.replace(' ', '').replace('\n', '')

            item['author'] = entry.xpath(
                './/div[@class="txt-box"]/div[@class="s-p"]/a/text()'
            ).extract_first()

            yield item
Ejemplo n.º 3
0
    def get_weixin(self, response):
        """Extract the account nickname and public number from a profile page.

        Returns a WeixinItem, or None when no ``profile_inner`` block exists
        on the page.  The original raised NameError in that case because
        ``weixinName``/``publicNum`` were only bound inside the loop.
        """
        weixinName = None
        publicNum = None

        # If several profile blocks match, the last one wins (same behaviour
        # as the original loop).
        for wx in response.xpath('//div[@class="profile_inner"]'):
            weixinName = wx.xpath(
                './strong[@class="profile_nickname"]/text()').extract()[0]
            publicNum = wx.xpath(
                './/span[@class="profile_meta_value"]/text()').extract()[0]

        if weixinName is None:
            # No profile block found: nothing to emit.
            return None

        item = WeixinItem()
        item['publicNum'] = publicNum
        item['weixinName'] = weixinName
        return item
Ejemplo n.º 4
0
    def parse(self, response):
        """Parse a search-result page, follow each article, then paginate.

        Fix vs. original: a fresh WeixinItem is created per entry.  The
        original reused one instance, so every pending Request's ``meta``
        pointed at the same item whose fields were overwritten each
        iteration.
        """
        li_list = response.xpath('//*[@id="main"]/div[5]/ul/li')
        for li in li_list:
            item = WeixinItem()
            item['article_urlx'] = li.xpath(
                './div[2]/h3/a/@href').extract_first()
            item['writer'] = li.xpath('./div[2]/div/a/text()').extract_first()
            item['intro'] = li.xpath(
                './div[2]/div/span/text()[1]').extract_first()
            # The title is split into three nodes because the matched keyword
            # is highlighted (<em>) in a different colour.
            name_q = li.xpath('./div[2]/h3/a/text()[1]').extract_first()
            name_red = li.xpath('./div[2]/h3/a/em/text()[1]').extract_first()
            name_h = li.xpath('./div[2]/h3/a/text()[2]').extract_first()
            item['article_title'] = str(name_q) + str(name_red) + str(name_h)
            # Guard: scrapy.Request raises on a None url.
            if item['article_urlx']:
                yield scrapy.Request(item['article_urlx'],
                                     callback=self.parse_ziye,
                                     meta=item)

        # Pagination: the "next" href is relative and must be prefixed.
        url_0 = 'https://weixin.sogou.com/weixin'
        url_1 = response.xpath('//*[@id="sogou_next"]/@href').extract_first()
        if url_1 is not None:
            yield scrapy.Request(url_0 + url_1, callback=self.parse)
        else:
            print('******木有下一页了******')
Ejemplo n.º 5
0
    def parse(self, response):
        """Build one item per <li> element using CSS selectors.

        Fix vs. original: ``print`` as a function so this also runs under
        Python 3 (the original used the Python 2 print statement, a syntax
        error in Python 3).
        """
        print(response.body)

        for li in response.css("li"):
            item = WeixinItem()
            item['title'] = li.css("h3 a::text").extract_first()
            item["content"] = li.css("p::text").extract_first()
            item['url'] = li.css("a::attr(href)").extract_first()
            print(item)
            yield item
Ejemplo n.º 6
0
 def parse_profile(self, response):
     """Return a WeixinItem holding the title, publish time, source, raw
     body and tag-stripped content of one article page."""
     item = WeixinItem()
     item['title'] = response.xpath('//title/text()').extract()[0].strip()
     item['create_time'] = response.xpath(
         '//em[@id="post-date"]/text()').extract()[0].strip()
     item['source'] = response.xpath(
         '//a[@id="post-user"]/text()').extract()[0].strip()
     item['body'] = response.body.strip()
     # Strip the markup from the article container to get plain text.
     raw_content = response.xpath(
         '//div[@id="js_content"]').extract()[0].strip()
     item['content'] = remove_tags(raw_content).strip()
     return item
Ejemplo n.º 7
0
    def wenzhang(self, response):
        """Yield one WeixinItem per article title block on an account page.

        Fixes vs. original: a fresh item per entry (the original reused a
        single instance across yields), and a None guard before ``.strip()``
        (``extract_first()`` may return None, which crashed the original).
        """
        nick_name = response.xpath(
            "//div[@class='personalSet-c']/span/text()").extract_first()
        cate_name = response.meta['name']

        for content in response.xpath("//div[@class='bookTitle']"):
            item = WeixinItem()
            item['NickName'] = nick_name
            item['CateName'] = cate_name
            # Article title
            item['MsgTitle'] = content.xpath(".//a/text()").extract_first()
            # NOTE: str.strip('阅读') removes those characters from *both*
            # ends, not just a leading "阅读" prefix — kept as-is to preserve
            # the original behaviour.
            read_num = content.xpath(".//i[1]/text()").extract_first()
            item['ReadNum'] = read_num.strip('阅读') if read_num else read_num

            yield item
Ejemplo n.º 8
0
    def parse_page(self, response):
        """Parse the JSON article list and scrape each article's body/time.

        Fixes vs. original: the downloaded page is no longer bound to
        ``data`` (which shadowed the JSON payload) nor its regex to ``time``
        (which shadowed the stdlib module); the three regexes are compiled
        once, outside the article loop.

        NOTE(review): this performs blocking urllib downloads inside a
        Scrapy callback; consider yielding scrapy.Request instead.
        """
        if not response:
            return

        data = json.loads(response.text)

        # Compile the scraping patterns once, not per article.
        content_re = re.compile(
            '<div class="rich_media_content " id="js_content">.*?</div>',
            re.DOTALL)
        text_re = re.compile('>(.*?<)', re.DOTALL)
        publish_re = re.compile(
            '<em id="publish_time" class="rich_media_meta rich_media_meta_text">(.*?)</em>',
            re.DOTALL)

        for app_msg in data['app_msg_list']:
            item = WeixinItem()
            item['title'] = app_msg['title']
            item['update_time'] = app_msg['update_time']
            item['link'] = app_msg['link']

            # Download the article page (blocking).
            html = urllib.request.urlopen(
                app_msg['link']).read().decode('utf-8')

            container = content_re.findall(html)
            fragments = text_re.findall(container[0])
            # Keep text fragments, dropping markup residue and the first
            # fragment (same filter as the original).
            paragraphs = []
            for frag in fragments:
                if frag not in ('', '<', '>', '\\r', '\\n', "<'") \
                        and frag != fragments[0]:
                    paragraphs.append(frag)
            item['contents'] = paragraphs

            item['time'] = publish_re.findall(html)

            yield item
Ejemplo n.º 9
0
 def parse_content(self, response):
     """Parse one article page and yield a fully populated WeixinItem."""
     page_title = response.xpath(
         '//*[@id="activity-name"]/text()').extract()[0].encode('utf-8').strip()
     post_date = response.xpath('//*[@id="post-date"]/text()').extract()[0]
     raw_html = response.xpath('//*[@id="js_content"]').extract()[0]
     body_text = response.xpath(
         'string(//*[@id="js_content"])').extract_first(default="").strip()
     yield WeixinItem(
         title=page_title,
         tag=response.meta['tag'],
         url=response.url,
         source=response.meta['source'],
         pub_date=post_date,
         content=body_text,
         html=escape_string(raw_html),
         insert_date=datetime.datetime.today().strftime('%Y-%m-%d'),
     )
Ejemplo n.º 10
0
 def parse_artic(self, response):
     """Yield an item for every article image not yet seen.

     Images whose URL md5 already exists in ``collection`` (presumably a
     MongoDB collection — verify against the spider setup) are skipped.
     """
     pic_urls = response.xpath(
         "//div[@id='js_content']/p/img/@data-src").extract()
     for pic_url in pic_urls:
         digest = hashlib.md5(pic_url.encode("utf8")).hexdigest()
         if collection.find_one({"pic_id": digest}):
             print("continue")
             continue
         weixin_item = WeixinItem()
         weixin_item["pic_id"] = digest
         weixin_item["pic_link"] = pic_url
         weixin_item["pic_desc"] = self.getPicDesc()
         yield weixin_item
Ejemplo n.º 11
0
    def parse(self, response):
        """Extract items from a result page and follow up to page 3.

        Fix vs. original: ``print`` as a function so this also runs under
        Python 3 (the original used Python 2 print statements).
        """
        time.sleep(2)  # crude rate limiting between page requests

        for sel in response.xpath("//div[@class='txt-box']"):
            item = WeixinItem()
            self.get_title(item, sel)
            self.get_link(item, sel)
            self.get_name(item, sel)
            self.get_timestamp(item, sel)
            yield item

        next_link = response.xpath("//a[@id='sogou_next']/@href").extract()
        if next_link:
            next_url = 'http://weixin.sogou.com/weixin' + next_link[0]
            current_page_index = response.xpath(
                "//div[@id='pagebar_container']/span/text()").extract()
            # Only crawl the first three pages.
            if current_page_index and int(current_page_index[0]) <= 3:
                print('-------------------------------')
                print(current_page_index[0])
                yield Request(url=next_url, callback=self.parse)
Ejemplo n.º 12
0
    def parse_page(self, response):
        """Extract title/url pairs from the search-result anchors.

        Fix vs. original: ``print`` as a function so this also runs under
        Python 3 (the original used Python 2 print statements).
        """
        selector = Selector(response)

        # Keep the xpath simple: overly complex paths slow retrieval and
        # were observed to stall the crawl (per the original author's note).
        content_list = selector.xpath("//div[@class='txt-box']/h3/a")
        for content in content_list:
            item = WeixinItem()
            url = content.xpath('@href').extract_first()
            title = content.xpath('string(.)').extract_first()
            print(url)
            print(title)
            item['title'] = title
            item['url'] = url
            yield item
Ejemplo n.º 13
0
    def parse(self, response):
        """Collect result entries, request the next page, then follow each
        entry's detail URL.

        Fix vs. original: ``extract_first()`` instead of ``extract()[0]``
        for the entry link, so an entry without a link is skipped rather
        than raising IndexError.
        """
        items = []
        for each in response.xpath("//div[@class='txt-box']"):
            item = WeixinItem()
            item['name'] = each.xpath("./div/a/text()").extract()
            item['title'] = each.xpath("./h3/a/text()").extract()
            item['sub_urls'] = each.xpath("./h3/a/@href").extract_first()
            item['keyword'] = self.keyword
            if item['sub_urls']:
                items.append(item)
            self.log(each.xpath("./h3/a/@href").extract())

        # Advance offset and re-request the next page through this same
        # callback (capped at offset 20).
        if self.offset <= 20:
            self.offset += 1
            yield scrapy.Request(self.urls + str(self.offset),
                                 callback=self.parse)

        # Follow each entry's detail page, passing the item along in meta.
        for item in items:
            yield scrapy.Request(url=item['sub_urls'],
                                 meta={'meta_1': item},
                                 callback=self.detail_parse)
Ejemplo n.º 14
0
    def parse_page(self, response):
        """Parse the JSON article list and fetch read/like counts per article.

        Fix vs. original: the POST payload is bound to ``post_data`` instead
        of ``data``, which shadowed the decoded JSON payload above it.

        NOTE(review): the Cookie, key, pass_ticket and appmsg_token below are
        session-bound and expire — they must be refreshed before each run.
        Blocking ``requests.post`` inside a Scrapy callback also serialises
        the crawl; consider scrapy.FormRequest instead.
        """
        if not response:
            return

        data = json.loads(response.text)

        for app_msg in data['app_msg_list']:
            item = WeixinItem()

            # Title
            item['title'] = app_msg['title']
            # Last-modified time
            item['update_time'] = app_msg['update_time']
            # Article link
            item['link'] = app_msg['link']

            # Pull the per-article identifiers out of the link's query string.
            query = parse_qs(urlparse(item['link']).query)
            sn = query['sn'][0]
            idx = query['idx'][0]
            mid = query['mid'][0]

            # Stats endpoint
            url = "http://mp.weixin.qq.com/mp/getappmsgext"

            # The Cookie avoids a login flow; the User-Agent should look like
            # the in-app WeChat browser.
            headers = {
                "Host": "mp.weixin.qq.com",
                "Connection": "keep-alive",
                "Content-Length": "760",
                "Accept": "*/*",
                "CSP": "active",
                "Content-Type":
                "application/x-www-form-urlencoded; charset=UTF-8",
                "Origin": "https://mp.weixin.qq.com",
                "Cookie": "",  # your cookie
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.27.400 QQBrowser/9.0.2524.400"
            }

            # POST body (renamed from `data` to avoid shadowing the JSON
            # payload).
            post_data = {
                "__biz": "MzUzNTcwNDkxNA==",
                "is_only_read": "1",
                "req_id": "22219NGqdgu78GNZHiJk7Dqr",  # session-bound
                "pass_ticket":
                'BNKM3eeVl4pVB61oYdfUKMfAB54LcsYnT2dupvcSrryRe5JIfvBfdwYi8Q1oOr%252F1',  # session-bound
                "is_temp_url": "0",
            }

            # URL query parameters.
            params = {
                "mid": mid,
                "sn": sn,
                "idx": idx,
                "key":
                "11ebb00d5fe818881fc4d26ab284b0eee3a515b9d95ebab2be9f748ea37259999ed3b114d76e77fabd3e79facdc2b8fde86e0df810f354d1cef85366b9efc0e6a09eda7bd03840b5ba9bc2a4e7303932",
                "pass_ticket":
                "BNKM3eeVl4pVB61oYdfUKMfAB54LcsYnT2dupvcSrryRe5JIfvBfdwYi8Q1oOr%252F1",  # session-bound
                "appmsg_token":
                "953_ucEzgSDLUI%2BPyWTRF9DLSYlVmTw0uEzaTcj4VgOyE_82wFzJhD7meE2JYiTP7iEzCC_gzRAZJMTSA-h1",  # session-bound
            }

            # Blocking POST for the per-article statistics.
            content = requests.post(url,
                                    headers=headers,
                                    data=post_data,
                                    params=params).json()

            # Read count
            item['read_num'] = content["appmsgstat"]["read_num"]
            # Like count
            item['like_num'] = content["appmsgstat"]["like_num"]

            yield item
Ejemplo n.º 15
0
 def parse(self, response):
     """Gather every headline and abstract on the page into a single item."""
     field_xpaths = {
         'title': '//h3/a/text()',
         'abstract': '//p/text()',
     }
     item = WeixinItem()
     for field, xp in field_xpaths.items():
         item[field] = response.xpath(xp).extract()
     return item