Example #1
 def parse(self, response):
     # print(response.text)
     # 5 & sub_ch=mobile&id=SI_Cont&page=1&num=20
     self.db = BLDBHelper()
     baseurl = "http://api.slide.news.sina.com.cn/interface/api_album.php?activity_size=198_132&size=img&ch_id="
     baseurl1 = "&id=SI_Cont&num=20&page=1"
     a = self.db.selectenameforxltp()
     # b=str(a[1][0]).split("&")[0]
     # c=str(a[1][0]).split("&")[1]
     # print(a[1][0])
     # print(type(a[1][0]))
     # contenturl = baseurl + b + "&sub_ch=" + c + baseurl1
     # print(contenturl)
     # print(b)
     # print(c)
     j = 0
     for i in a:
         ch_id = a[j][0]
         b = str(ch_id).split("&")[0]
         c = str(ch_id).split("&")[1]
         j = j + 1
         contenturl = baseurl + b + "&sub_ch=" + c + baseurl1
         # print(contentid)
         # print(date)
         print(contenturl)
         yield scrapy.Request(url=contenturl,
                              callback=self.parse_content_url)
Example #2
 def parse(self, response):
     self.db = BLDBHelper()
     a = self.db.selectenamebyvideo()
     print(a[1][0])
     print(type(a[1][0]))
     j = 0
     baseurl = "https://haokan.baidu.com/videoui/api/videorec?tab="
     baseurl2 = "&act=pcFeed&pd=pc&num=20&shuaxin_id=1587787443537"
     for i in a:
         tyasf = a[j][0]
         print(tyasf)
         j = j + 1
         # use the row captured before the increment; indexing a[j] after it
         # skips the first row and raises IndexError on the last iteration
         yield scrapy.Request(url=baseurl + str(tyasf) + baseurl2,
                              callback=self.parse_format2_url)
Example #3

 def parse(self, response):
     self.db = BLDBHelper()
     a = self.db.selectreaderid()
     print(a[1][0])
     j = 0
     for i in a:
         readerid = a[j][0]
         print(readerid)
         item = readertypeItem()
         item["readertype"] = "默认"
         item["readerid"]= int(readerid)
         item["level"] = "1"
         item["priority"] = "A"
         j = j + 1
         yield item
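
readertypeItem is not defined anywhere in the listing. A plausible minimal declaration, with the field names taken from the assignments above and everything else assumed:

import scrapy

class readertypeItem(scrapy.Item):
    # fields inferred from the parse() above
    readertype = scrapy.Field()  # "默认" ("default")
    readerid = scrapy.Field()
    level = scrapy.Field()
    priority = scrapy.Field()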
Example #4
 def file_path(self, request, response=None, info=None):
     image_name = request.meta['item']['name']
     videooriginurl = request.url
     self.db = BLDBHelper()
     # look up the video's database id by its origin URL
     videoidt = self.db.selectVideobyurl(videooriginurl)
     videoid = str(videoidt[0][0])
     videoname = videoid + '.mp4'
     path = image_name + '/' + videoname
     print(path)
     return path
Example #5
 def parse(self, response):
     # print(response.text)
     self.db = BLDBHelper()
     a = self.db.selectnewsurl()
     # print(a[1][0])
     # print(type(a[1][0]))
     j = 0
     for i in a:
         contentid = a[j][0]
         date = contentid[0:8]
         j = j + 1
         contenturl = "https://new.qq.com/omn/" + date + "/" + contentid + ".html"
         # print(contentid)
         # print(date)
         # print(contenturl)
         yield scrapy.Request(url=contenturl,
                              callback=self.parse_content_url)
Example #6
 def parse(self, response):
     # print(response.text)
     self.db = BLDBHelper()
     a = self.db.selectnewsename()
     print(a[1][0])
     print(type(a[1][0]))
     j = 0
     for i in a:
         self.tyasf = a[j][0]
         print(self.tyasf)
         # cid = self.db.selectcidforename(self.tyasf)
         # print(cid[0][0])
         # print(type(cid[0][0]))
         j = j + 1
         self.page_firsturl = "https://pacaio.match.qq.com/irs/rcd?cid=146&token=" + self.token + "&ext=" + self.tyasf + "&page=0"
         print(self.page_firsturl)
         yield scrapy.Request(url=self.page_firsturl, callback=self.parse_page_url)
Example #7

    def parse(self, response):
        # print(response.text)
        self.db = BLDBHelper()
        a = self.db.selectpicsurl()
        print(a[1][0])
        print(type(a[1][0]))
        j = 0
        for i in a:
            contenturl = a[j][0]
            # date=contentid[0:8]
            j = j + 1

            # print(contentid)
            # print(date)
            # print(contenturl)
            yield scrapy.Request(url=contenturl + "?vt=4&hd=1",
                                 callback=self.parse_content_url)
Example #8

class InsertBysjDataSpider(Spider):
    # spider name
    name = 'InsertBysjData'
    # starting URL
    start_urls = ['http://baidu.com/']

    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },

    }

    # # handle the response for start_urls
    # def parse(self, response):
    #     for i in range(60):
    #         item = readerItem()
    #         item1= userItem()
    #         item1["nick"]="匿名用户" + str(i)
    #         item1["password"]="******"
    #         item1["head"] = "https://xiaoliwaer.top:525/headerimg/temp.jpg"
    #         item1["usertype"] = "r"
    #         item["nick"] = "匿名用户" + str(i)
    #         item["birthday"] = "1998-01-25"
    #         item["telephone"] = "13656252033"
    #         item["sex"]= "男"
    #         item["work"]= "程序员"
    #         item["likeread"] = "默认"
    #         item["city"] = "广东东莞"
    #         item["intro"] = "无"
    #         print(str(i))
    #         yield item1
    #         yield item

    def parse(self, response):
        self.db = BLDBHelper()
        a = self.db.selectreaderid()
        print(a[1][0])
        j = 0
        for i in a:
            readerid = a[j][0]
            print(readerid)
            item = readertypeItem()
            item["readertype"] = "默认"
            item["readerid"]= int(readerid)
            item["level"] = "1"
            item["priority"] = "A"
            j = j + 1
            yield item
Example #9

class HaokanspSpider(Spider):
    # spider name
    name = 'videodownload'
    # starting URL
    start_urls = ['https://baidu.com']

    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    def parse(self, response):
        self.db = BLDBHelper()
        a = self.db.selectenamebyvideo()
        print(a[1][0])
        print(type(a[1][0]))
        ename = a[7][0]  # hardcoded: only the 8th category row is used here
        cidt = self.db.selectcidforename(ename)
        cid = cidt[0]
        print(cid)
        b = self.db.selectVideo(cid)
        j = 0
        file_urls = []
        item = ExamplesItem()
        for i in b:
            url = b[j][0]
            print(b[j][0])
            file = b[j][0]
            j = j + 1
            file_urls.append(file)
        item["file_urls"] = file_urls
        item["name"] = ename
        yield item
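
ExamplesItem is likewise not part of the listing. For Scrapy's FilesPipeline to act on the item it needs at least a file_urls input field (and conventionally a files output field); name is the extra field consumed by the file_path() override in Example #11. A minimal sketch:

import scrapy

class ExamplesItem(scrapy.Item):
    file_urls = scrapy.Field()  # input: list of video URLs to download
    files = scrapy.Field()      # output: filled in by FilesPipeline
    name = scrapy.Field()       # category name, used as the target folder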
Example #10

 def parse(self, response):
     self.db = BLDBHelper()
     a = self.db.selectenamebyvideo()
     print(a[1][0])
     print(type(a[1][0]))
     ename = a[7][0]  # hardcoded: only the 8th category row is used here
     cidt = self.db.selectcidforename(ename)
     cid = cidt[0]
     print(cid)
     b = self.db.selectVideo(cid)
     j = 0
     file_urls = []
     item = ExamplesItem()
     for i in b:
         url = b[j][0]
         print(b[j][0])
         file = b[j][0]
         j = j + 1
         file_urls.append(file)
     item["file_urls"] = file_urls
     item["name"] = ename
     yield item
Example #11
class VideoPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # one download request per URL in the item's file_urls
        for video_url in item['file_urls']:
            yield Request(video_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        image_name = request.meta['item']['name']
        videooriginurl = request.url
        self.db = BLDBHelper()
        # look up the video's database id by its origin URL
        videoidt = self.db.selectVideobyurl(videooriginurl)
        videoid = str(videoidt[0][0])
        videoname = videoid + '.mp4'
        path = image_name + '/' + videoname
        print(path)
        return path
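
For VideoPipeline to run at all it must be registered in the project settings together with a storage root. A sketch, where the module path examples.pipelines and the store directory are assumptions:

# settings.py (sketch)
ITEM_PIPELINES = {
    'examples.pipelines.VideoPipeline': 1,  # assumed module path
}
FILES_STORE = '/data/videos'  # root that file_path() results are joined onto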
Example #12
class NewscontentSpider(Spider):
    # spider name
    p = ""
    name = 'newscontent'
    # starting URL
    start_urls = ['https://new.qq.com/omn/20200325/20200325A0LQ9J00.html']

    page_num = "0"

    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # handle the response for start_urls
    def parse(self, response):
        # print(response.text)
        self.db = BLDBHelper()
        a = self.db.selectnewsurl()
        # print(a[1][0])
        # print(type(a[1][0]))
        j = 0
        for i in a:
            contentid = a[j][0]
            date = contentid[0:8]
            j = j + 1
            contenturl = "https://new.qq.com/omn/" + date + "/" + contentid + ".html"
            # print(contentid)
            # print(date)
            # print(contenturl)
            yield scrapy.Request(url=contenturl,
                                 callback=self.parse_content_url)

            #yield scrapy.Request(url='https://new.qq.com/omn/20200325/20200325A0LQ9J00.html', callback=self.parse_content_url)

    def parse_content_url(self, response):
        # print(response.text)
        body = response.body.decode('GBK')
        newcontentp = Selector(
            text=body).xpath('//div[@class="content-article"]/p').extract()
        newcontenth1 = Selector(
            text=body).xpath('//div[@class="LEFT"]/h1[1]/text()').extract()
        # print(newcontentp)
        # print(type(newcontentp))

        # self.p is "" so these joins just concatenate the extracted fragments
        temh1 = self.p.join(item for item in newcontenth1)
        temp = self.p.join(item for item in newcontentp)
        # print(temp)
        temp1 = temp.replace("//", "https://")
        #print(temp1)
        # temp1=temp1.join(temp)
        # content=temp1+temp
        # for item in newcontentp:
        #       print(item)
        #       self.p= item.join(self.p)
        #       print(type(item))
        # print(content)
        #
        # for item in newcontenth1:
        #       print(item)
        #       print(type(item))

        item = newscontentItem()
        url = response.url
        urltemp = url.split('/')[-1]
        item["newsurl"] = urltemp.split('.')[0]
        print(item["newsurl"])
        print(type(temp1))
        item["newscontent"] = temp1
        # item["newstitle"]=temh1
        yield item
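
The content id is recovered above with two chained split() calls, which breaks if the URL ever carries a query string. A small sketch of a more robust variant using only the standard library:

from urllib.parse import urlsplit
from pathlib import PurePosixPath

def content_id_from_url(url):
    # '/omn/20200325/20200325A0LQ9J00.html' -> '20200325A0LQ9J00'
    return PurePosixPath(urlsplit(url).path).stem

assert content_id_from_url(
    "https://new.qq.com/omn/20200325/20200325A0LQ9J00.html"
) == "20200325A0LQ9J00"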
Example #13
class HaokanspSpider(Spider):
    # spider name
    name = 'haokansp'
    # starting URL
    start_urls = ['https://baidu.com']

    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # handle the response for start_urls
    def parse(self, response):
        self.db = BLDBHelper()
        a = self.db.selectenamebyvideo()
        print(a[1][0])
        print(type(a[1][0]))
        j = 0
        baseurl = "https://haokan.baidu.com/videoui/api/videorec?tab="
        baseurl2 = "&act=pcFeed&pd=pc&num=20&shuaxin_id=1587787443537"
        for i in a:
            tyasf = a[j][0]
            print(tyasf)
            j = j + 1
            # use the row captured before the increment; indexing a[j] after it
            # skips the first row and raises IndexError on the last iteration
            yield scrapy.Request(url=baseurl + str(tyasf) + baseurl2,
                                 callback=self.parse_format2_url)
        # yield scrapy.Request(url=baseurl + "junshi" + baseurl2, callback=self.parse_format2_url)

    def parse_format2_url(self, response):
        # print(response.text)
        haokanspurllist_dict = json.loads(response.text)
        ename = response.url[50:-49]
        cid = self.db.selectcidforename(ename)

        if haokanspurllist_dict.get("errno") == 0:
            data = haokanspurllist_dict.get('data')
            print(data)
            for name, info in haokanspurllist_dict.items():
                print(name)
                if name == "data":
                    print("**** first-level element named 'data' ****")
                    for key, value in info.items():
                        print("**** second-level key/value pairs under 'data' ****")
                        print(key, ':', value)
                        if key == "response":
                            for ikey, ivalue in value.items():
                                print(ikey, ':', ivalue)

                                for j in ivalue:
                                    # build a fresh item per video entry; reusing
                                    # one mutable item would leave every yielded
                                    # item carrying the last video's values
                                    item = VideoItem()
                                    item["videoorigin"] = "hksp"
                                    item["cid"] = str(cid[0][0])
                                    item["videoctime"] = datetime.datetime.now()
                                    for jkey, jvalue in j.items():
                                        print("**** video entry from the Haokan list page ****")
                                        print(jkey, ':', jvalue)
                                        if jkey == "title":
                                            item["videoname"] = jvalue
                                        if jkey == "poster":
                                            item["videoheadurl"] = jvalue
                                        if jkey == "url":
                                            item["videooriginurl"] = jvalue + "&"
                                        if jkey == "source_name":
                                            item["videokeyword"] = jvalue
                                    yield item
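
The nested key/value loops above rediscover a structure that the errno/data checks already pin down. Assuming the payload shape is data -> response -> videos (the "videos" key name is a guess; the original iterates whatever keys appear under "response"), the same parse can be written flat with dict.get:

def parse_format2_url(self, response):
    # sketch under the assumed payload shape; VideoItem, json and datetime
    # come from the surrounding example's imports
    payload = json.loads(response.text)
    cid = self.db.selectcidforename(response.url[50:-49])
    if payload.get("errno") != 0:
        return
    for video in payload.get("data", {}).get("response", {}).get("videos", []):
        item = VideoItem()
        item["videoorigin"] = "hksp"
        item["cid"] = str(cid[0][0])
        item["videoctime"] = datetime.datetime.now()
        item["videoname"] = video.get("title")
        item["videoheadurl"] = video.get("poster")
        item["videooriginurl"] = video.get("url", "") + "&"
        item["videokeyword"] = video.get("source_name")
        yield item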
Example #14
class NewsnbalistSpider(Spider):
    # spider name
    name = 'newslist'
    # starting URL
    start_urls = ['https://www.qq.com']
    # page_firsturl = 'https://pacaio.match.qq.com/vlike/category?cid=1&num=20&page=0'
    # page_baseurl = 'https://pacaio.match.qq.com/vlike/category?cid=1&num=20&page='
    page_num="0"
    token="49cbb2154853ef1a74ff4e53723372ce"

    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'no-cache',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        },
    }

    # handle the response for start_urls
    def parse(self, response):
        # print(response.text)
        self.db = BLDBHelper()
        a = self.db.selectnewsename()
        print(a[1][0])
        print(type(a[1][0]))
        j = 0
        for i in a:
            self.tyasf = a[j][0]
            print(self.tyasf)
            # cid = self.db.selectcidforename(self.tyasf)
            # print(cid[0][0])
            # print(type(cid[0][0]))
            j = j + 1
            self.page_firsturl = "https://pacaio.match.qq.com/irs/rcd?cid=146&token=" + self.token + "&ext=" + self.tyasf + "&page=0"
            print(self.page_firsturl)
            yield scrapy.Request(url=self.page_firsturl, callback=self.parse_page_url)

    def parse_page_url(self, response):
        # print(response.text)
        # recover ext and page by slicing the URL at fixed offsets (brittle;
        # see the urllib.parse sketch after this example)
        strename = response.url[87:]
        strename1 = strename[:-7]
        pagenum = strename[-1:]
        print(pagenum)
        # print(strename1)
        cid = self.db.selectcidforename(strename1)
        urllist_dict = json.loads(response.text)
        # print(cid)
        if urllist_dict.get("datanum") > 0:
            data = urllist_dict.get('data')
            for i in data:
                print("txxwpageurl:" + i["app_id"])
                print("pagetitle:" + i["title"])
                print("cid:" + i["category"])
                print("keywords:" + i["keywords"])
                print("keywords:" + i["img"])

                item = NewsItem()
                item["newsoriginurl"] = i["app_id"]
                item["newstitle"] = i["title"]
                item["cid"] = cid[0][0]
                item["newsorigin"] = "txxw"
                item["newskeyword"] = i["keywords"]
                item["newsheadurl"] = i["img"]
                item["newsctime"]=datetime.datetime.now()
                yield item

        # queue pages 4-7; Scrapy's default duplicate filter drops the repeat
        # requests that later responses would otherwise re-spawn
        for i in range(4, 8):
            self.page_num = str(i)
            page_url = "https://pacaio.match.qq.com/irs/rcd?cid=146&token=" + self.token + "&ext=" + strename1 + "&page="+self.page_num
            print(page_url)
            yield scrapy.Request(url=page_url, callback=self.parse_page_url)
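
parse_page_url recovers ext and page by slicing response.url at fixed offsets ([87:], [:-7], [-1:]), which silently breaks if the token length or parameter order changes. A sketch of the same extraction via urllib.parse:

from urllib.parse import urlsplit, parse_qs

def ext_and_page(url):
    # robust replacement for the fixed-offset slicing above
    qs = parse_qs(urlsplit(url).query)
    return qs["ext"][0], qs["page"][0]

ext, page = ext_and_page(
    "https://pacaio.match.qq.com/irs/rcd?cid=146&token=x&ext=ent&page=0")
assert (ext, page) == ("ent", "0")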
Example #15
class NewscontentSpider(Spider):
    # spider name
    p = ""
    name = 'picscontent'
    # starting URL
    start_urls = ['https://photo.sina.cn/']

    page_num = "0"

    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # handle the response for start_urls
    def parse(self, response):
        # print(response.text)
        self.db = BLDBHelper()
        a = self.db.selectpicsurl()
        print(a[1][0])
        print(type(a[1][0]))
        j = 0
        for i in a:
            contenturl = a[j][0]
            # date=contentid[0:8]
            j = j + 1

            # print(contentid)
            # print(date)
            # print(contenturl)
            yield scrapy.Request(url=contenturl + "?vt=4&hd=1",
                                 callback=self.parse_content_url)
        # yield scrapy.Request(url="https://photo.sina.cn/album_24_86328_133303.htm", callback=self.parse_content_url)

    def parse_content_url(self, response):
        print(response.text)
        # body = response.body.decode('GBK')
        newcontentp = Selector(text=response.text).xpath(
            '//section[@class="section-item"]').extract()
        newcontenth1 = Selector(
            text=response.text).xpath('//h1[1]/text()').extract()
        print(newcontentp)
        print(type(newcontentp))

        temh1 = self.p.join(item for item in newcontenth1)
        temp = self.p.join(item for item in newcontentp)
        print(temp)
        print(type(temp))

        temp2 = temp.replace('src="data:', 'id="')
        temp1 = temp2.replace("data-src", "src")

        print(temp2)
        item = picscontentItem()
        tempurl = response.url
        item["picsurl"] = tempurl.split('?')[0]
        item["picscontent"] = temp1
        yield item
Example #16
class XltpSpider(Spider):
    # spider name
    name = 'xltp'
    # starting URL
    start_urls = ['http://slide.ent.sina.com.cn/']

    page_num = "0"

    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # handle the response for start_urls
    def parse(self, response):
        # print(response.text)
        # 5 & sub_ch=mobile&id=SI_Cont&page=1&num=20
        self.db = BLDBHelper()
        baseurl = "http://api.slide.news.sina.com.cn/interface/api_album.php?activity_size=198_132&size=img&ch_id="
        baseurl1 = "&id=SI_Cont&num=20&page=1"
        a = self.db.selectenameforxltp()
        # b=str(a[1][0]).split("&")[0]
        # c=str(a[1][0]).split("&")[1]
        # print(a[1][0])
        # print(type(a[1][0]))
        # contenturl = baseurl + b + "&sub_ch=" + c + baseurl1
        # print(contenturl)
        # print(b)
        # print(c)
        j = 0
        for i in a:
            ch_id = a[j][0]
            b = str(ch_id).split("&")[0]
            c = str(ch_id).split("&")[1]
            j = j + 1
            contenturl = baseurl + b + "&sub_ch=" + c + baseurl1
            # print(contentid)
            # print(date)
            print(contenturl)
            yield scrapy.Request(url=contenturl,
                                 callback=self.parse_content_url)
        # cid = self.db.selectcidforchname("海军力量")
        # print(cid[0][0])
        # print(type(cid[0][0]))

        # yield scrapy.Request(url=contenturl, callback=self.parse_content_url)

    def parse_content_url(self, response):
        print(response.text)
        page = response.url[-1]
        # body = response.body.decode('GBK')
        xltp_dict = json.loads(response.text)
        if xltp_dict.get("count") == "20":
            data = xltp_dict.get('data')
            for i in data:
                print("!!!!!!!!!!!!!!!!!!!!!!!")
                print(i)
                item = PicsItem()
                item["picsorigin"] = "xltp"
                print(item["picsorigin"])
                item["picsctime"] = datetime.datetime.now()
                for jkey, jvalue in i.items():
                    print("&&&&&&&&&&&&&&&&")
                    if jkey == "name":
                        item["picsname"] = jvalue
                        print(item["picsname"])
                    if jkey == "img_url":
                        item["picsheadurl"] = jvalue
                        print(item["picsheadurl"])
                    if jkey == "url":
                        # item["picsurl"] = jvalue
                        strtemp = str(jvalue).split("/")[4]
                        picsurl = "https://photo.sina.cn/" + strtemp.replace(
                            "slide", "album")
                        # print(picsurl)
                        # print(str(jvalue).split("/")[4])
                        # print( item["picsurl"])
                        item["picsoriginurl"] = picsurl

                    if jkey == "short_name":
                        item["picskeyword"] = jvalue
                        print(item["picskeyword"])
                    if jkey == "sub_ch":
                        cid = self.db.selectcidforchname(str(jvalue))
                        item["cid"] = cid[0][0]
                    yield item
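
PicsItem, like the other item classes in this listing, is defined elsewhere in the project. A minimal declaration inferred from the fields assigned in parse_content_url above:

import scrapy

class PicsItem(scrapy.Item):
    picsorigin = scrapy.Field()     # source tag, e.g. "xltp"
    picsctime = scrapy.Field()      # crawl timestamp
    picsname = scrapy.Field()
    picsheadurl = scrapy.Field()    # cover image URL
    picsoriginurl = scrapy.Field()  # rebuilt album URL on photo.sina.cn
    picskeyword = scrapy.Field()
    cid = scrapy.Field()            # category id looked up from sub_ch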