Example #1
    def parseDomVideo(self, base, url):
        try:
            soup = self.fetchUrlWithBase(base + url, header)
            div = soup.first("div", {'class': 'players'})
            if div is not None:
                script = div.first('script')
                if script is not None:
                    # strip quoting from the inline player script, then
                    # regex each comma-separated chunk for the stream URL
                    text = unquote(
                        script.text.replace("\"", "").replace("\/", "/"))
                    for item in text.split(","):
                        match = regVideo.search(item)
                        if match is not None:
                            videoUrl = match.group(1)
                            return "%s%s%s" % ("http", videoUrl, 'm3u8')
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #2
    def fetchUrl(self, url):
        count = 0
        while count < maxCount:
            try:
                header = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                    "Referer": url
                }
                req = urllib2.Request(url, headers=header)
                content = urllib2.urlopen(req, timeout=3000).read()
                soup = BeautifulSoup(str(content))
                return soup
            except Exception as e:
                print common.format_exception(e)
                print 'error opening page, retrying', url, 'attempt', count
                count += 1

        print 'page still failing after', maxCount, 'retries', url
        return BeautifulSoup('')
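These fetch helpers and the parsers that follow are excerpts from a family of scraper classes, and they lean on module-level names the snippets never define: the retry budget maxCount, a default header dict, per-site baseurl values, compiled patterns such as regVideo, and common.format_exception. A minimal sketch of that assumed scaffolding, for Python 2 and an old BeautifulSoup release that still exposes soup.first(); every concrete value here is an illustrative assumption, not the original module's:

import re
import traceback
import urllib2
from urllib import unquote
from BeautifulSoup import BeautifulSoup  # old API where soup.first() still exists

# Hypothetical stand-ins; the names mirror the snippets, the values are assumptions.
maxCount = 3                        # retry budget used by the fetch helpers
baseurl = 'http://example.com'      # placeholder; each parser targets its own site
header = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': baseurl,
}
# illustrative pattern: capture what sits between the scheme and the m3u8 suffix
regVideo = re.compile(r'http(.+?)m3u8')


class common(object):
    """Stand-in for the project's common helper module."""

    @staticmethod
    def format_exception(e):
        return traceback.format_exc()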
Example #3
    def fetchImgItemsData(self, url, channel):
        try:
            trs = self.fetchDataHead(url)
            print url, ";itemsLen=", len(trs)
            objs = []
            sortType = dateutil.y_m_d()
            for item in trs:
                ahrefs = item.findAll("a")
                if not ahrefs:
                    continue
                for ahref in ahrefs:
                    match = img_channel_title.search(ahref.text)
                    if match is None:
                        continue
                    obj = {}
                    match = img_channel_date.search(ahref.text)
                    if match is not None:
                        obj['fileDate'] = match.group(0)
                    else:
                        obj['fileDate'] = ''
                    obj['name'] = ahref.text.replace(obj['fileDate'], '')
                    obj['url'] = ahref.get('href')
                    obj['baseurl'] = baseurl
                    obj['channel'] = channel
                    obj['updateTime'] = datetime.datetime.now()
                    pics = self.fetchImgs(ahref.get('href'))
                    if len(pics) == 0:
                        print 'no image files --', ahref, '---', url
                        continue
                    obj['picList'] = pics
                    obj['pics'] = len(pics)
                    obj['showType'] = 3
                    obj['sortType'] = sortType
                    print 'url=', obj['url'], 'filedate=', obj[
                        'fileDate'], '  image count=', len(pics)
                    objs.append(obj)
            return objs
        except Exception as e:
            print common.format_exception(e)
Example #4
    def parseDomVideo(self, url):
        header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            "Referer": url
        }
        try:
            match = regVideoid.search(url)
            if match is not None:
                vid = match.group(1)
                vurl = "/vod/%s/play.htm?%s-0-1" % (vid, vid)
                soup = self.fetchUrl(vurl)
                play_video = soup.first('div', {'class': 'body mab5 pat5'})
                if play_video is not None:
                    script = play_video.first('script')
                    if script is not None:
                        content = unquote(str(script.text))
                        match = regVideo.search(content)
                        if match is not None:
                            obj = json.loads(match.group(1))
                            data = obj.get('Data', [])
                            # flatten every playurls entry that carries an http URL
                            urlData = []
                            for item in data:
                                for itemUrl in item.get('playurls', []):
                                    for itemurlOne in itemUrl:
                                        if itemurlOne.count('http') > 0:
                                            urlData.append(itemurlOne)
                            # prefer m3u8 streams, then /share/ pages, then anything
                            for item in urlData:
                                if item.count('m3u8'):
                                    return item
                            for item in urlData:
                                if item.count('/share/'):
                                    return item
                            if len(urlData) > 0:
                                return urlData[0]
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
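The triple loop above only works against the JSON shape it implies: Data is a list of sources, each source's playurls is a list of entries, and each entry is itself a list mixing labels with URL strings. A standalone sketch of that assumed payload and the same flattening, with invented values (only the field names Data and playurls come from the snippet):

import json

raw = '''{"Data": [{"playurls": [["Episode 1", "http://cdn.example/ep1.m3u8"],
                                 ["Episode 2", "http://cdn.example/share/ep2"]]}]}'''
obj = json.loads(raw)

urlData = []
for source in obj.get('Data', []):
    for entry in source.get('playurls', []):   # each entry is itself a list
        for field in entry:
            if field.count('http') > 0:        # keep only the URL fields
                urlData.append(field)

print urlData  # ['http://cdn.example/ep1.m3u8', 'http://cdn.example/share/ep2']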
Example #5
    def run(self):
        try:
            dbVPN = db.DbVPN()
            ops = db_ops.DbOps(dbVPN)
            sortType = dateutil.y_m_d()
            for i in range(0, 20000):
                ret = ops.getTextChannelItemsById(i, sortType)
                if len(ret) == 0:
                    print 'writing finished'
                    break
                print 'start writing channel :', self.t_item["url"],
                for item in ret:
                    # earlier revisions also wrote raw .txt and tag-stripped
                    # copies here; only the HTML output remains active
                    path = filePATHHtml + str(item['id']) + ".html"
                    output = open(path, 'w')
                    output.write(
                        html_parse.txtToHtml(
                            html_parse.filter_tags(item['file'])))
                    output.close()
                    print 'wrote file:' + path
                print 'wrote page', i
            print 'channel :', self.t_item["url"], 'sync complete len=', len(ret)
            dbVPN.close()
        except Exception as e:
            print common.format_exception(e)
Example #6
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url)
            divs = soup.findAll("ul", {'class': 'play-list play-list-long'})
            for div in divs:
                ahref = div.first("a")
                # fetchContentUrl returns the raw page source, not a soup
                content = self.fetchContentUrl(ahref.get("href"))
                match = regVideo.search(content)
                if match is not None:
                    videoUrl = match.group(1)
                    return "%s%s%s" % ("http", videoUrl, 'm3u8')
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #7
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url)
            iframe = soup.first("iframe")
            if iframe is not None:
                ahref = iframe.get("src")
                if ahref is not None:
                    soup = self.fetchUrl(ahref)
                    scripts = soup.findAll("script")
                    for script in scripts:
                        if script.text is not None:
                            content = unquote(str(script.text))
                            match = regVideo.search(content)
                            if match is not None:
                                return "http" + match.group(1) + 'm3u8'
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #8
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url)
            div = soup.first("div", {"class": "vodplaybox"})
            if div is not None:
                ahrefs = div.findAll("a")
                if len(ahrefs) > 0:
                    # the last link points at the playable source
                    ahref = ahrefs[-1]
                    content = self.fetchContentUrlWithBase(ahref.get('href'))
                    content = unquote(str(content))
                    match = regVideo.search(content)
                    if match is not None:
                        return 'http' + match.group(1)
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #9
    def fetchUrl(self, url, aheader=h_headers):
        count = 0
        while count < maxCount:
            try:
                req = urllib2.Request(baseurl + url, headers=aheader)
                response = urllib2.urlopen(req, timeout=300)
                # check whether the server gzip-compressed the response
                gzipped = response.headers.get('Content-Encoding')
                content = response.read()
                if gzipped:
                    # decompress the raw bytes first, then decode
                    content = zlib.decompress(content, 16 + zlib.MAX_WBITS)
                content = content.decode('utf8', errors='replace')
                soup = BeautifulSoup(content)
                return soup
            except Exception as e:
                print common.format_exception(e)
                print 'error opening page, retrying', baseurl + url, 'attempt', count
                count += 1

        print 'page still failing after', maxCount, 'retries', url
        return BeautifulSoup('')
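The ordering in this helper matters: the original decoded the body to text before handing it to zlib, so every gzipped response corrupted the stream and fell into the retry loop. A minimal standalone sketch of decompress-first, decode-second (the sample body is invented):

import zlib

body = u'\u6d4b\u8bd5 page'.encode('utf8')   # invented sample body
compressed = zlib.compress(body)             # stands in for a compressed response

raw = zlib.decompress(compressed)            # bytes in, bytes out
text = raw.decode('utf8', errors='replace')  # decode to text only afterwards
print text

# the buggy order: decoding the compressed bytes first mangles the stream
# zlib.decompress(compressed.decode('utf8', errors='replace'))  # fails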
Example #10
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url, header)
            div = soup.first('div', {"class": "stab_list"})
            if div is not None:
                ahref = div.first('a')
                if ahref is not None:
                    soup = self.fetchUrl(ahref.get('href'), header)
                    player = soup.first('div', {"class": "player"})
                    if player is not None:
                        content = unquote(str(player.text)).split("$")
                        for item in content:
                            match = regVideo.search(item)
                            if match is not None:
                                return "http" + match.group(1) + '.m3u8'
            print url, 'no mp4'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #11
    def execute(self, query, args=None):
        try:
            if args is not None and isinstance(args, list):
                typeutil.listReplace(args, None, -1)
            # the number of %s placeholders must match the argument count
            if query.count('%s') != 0 and query.count('%s') != len(args or []):
                print 'error: sql error[%s][%s]' % (query, args)
                raise ValueError('error: sql error[%s][%s]' % (query, args))
            if self.__level:
                if args is not None:
                    print '%s:[%s]' % (query, args)
                else:
                    print query

            return self.cur.execute(query, args)
        except Exception as e:
            error = common.format_exception(e)
            if error.count("Duplicate entry") > 0:
                print query
                return None
            print error, query
            return None
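A hedged usage sketch of this wrapper, with invented table and column names: the placeholder check compares the number of %s slots against len(args), typeutil.listReplace rewrites None arguments to -1 per the call above, and a mismatch is logged and swallowed by the method's own except, so the caller simply gets None back:

# ops is assumed to be an instance of the class that owns execute().
ops.execute("INSERT INTO t_video (name, pics) VALUES (%s, %s)",
            ["demo", None])     # None is rewritten to -1 before execution
ops.execute("SELECT * FROM t_video WHERE id = %s", [42])

# placeholder/argument mismatch: the ValueError raised inside execute()
# is caught by its own except block, logged, and turned into None
ret = ops.execute("SELECT * FROM t_video WHERE id = %s AND name = %s", [42])
assert ret is None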
Example #12
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url, header)
            div = soup.first("iframe", {'name': 'iFrame1'})
            if div is not None:
                soup = self.fetchUrl(div.get("src"))
                scripts = soup.findAll("script")
                for script in scripts:
                    text = unquote(
                        script.text.replace("\"", "").replace("\/", "/"))
                    for item in text.split(","):
                        match = regVideo.search(item)
                        if match is not None:
                            videoUrl = match.group(1)
                            return "%s%s%s" % ("http", videoUrl, 'm3u8')
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #13
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(baseurl2 + url, header)
            iframe = soup.first("iframe")
            if iframe is not None:
                text = self.fetchContentUrl(iframe.get("src"), header)
                match = regVideoM3.search(text)
                if match is not None:
                    videoUrl = match.group(1)
                    return "%s%s%s" % ("http", videoUrl, 'm3u8')
            else:
                video = soup.first("table",
                                   {"class": "plhin nthread_firstpost"})
                if video is not None:
                    match = regVideoM3.search(video.text)
                    if match is not None:
                        videoUrl = match.group(1)
                        return "%s%s%s" % ("http", videoUrl, 'm3u8')
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #14
    def parseDomVideo(self, url):
        try:
            match = videoId3.search(url)
            if match is not None:
                videoId = match.group(1)
                videoUrlId = "/index.php?m=vod-play-id-%s-src-1-num-1.html" % (
                    videoId)
                soup = self.fetchUrl(baseurl3 + videoUrlId, header3)
                div = soup.first("div", {"class": "dyplayer"})
                if div is not None:
                    text = unquote(str(div.text))
                    for item in text.split(","):
                        match = regVideoM3.search(item)
                        if match is not None:
                            videoUrl = match.group(1)
                            return "%s%s%s" % ("http", videoUrl, 'm3u8')
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #15
    def parseDomVideo(self, url):
        header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            "Referer": url
        }
        try:
            soup = self.fetchUrl(url, header)
            adiv = soup.first("div", {"class": "film_bar clearfix"})
            if adiv is not None:
                ahref = adiv.first("a")
                if ahref is not None:
                    soup = self.fetchUrl(ahref.get("href"), header)
                    div = soup.first("div", {'class': 'player_l'})
                    if div is not None:
                        script = div.first('script')
                        if script is not None:
                            text = unquote(str(script.text))
                            texts = text.split("$")
                            # first pass: direct m3u8 streams
                            for item in texts:
                                match = regVideo.search(item)
                                if match is not None:
                                    videoUrl = match.group(1)
                                    return "%s%s%s" % ("http", videoUrl,
                                                       'm3u8')
                            # second pass: fall back to /share/ links
                            for item in texts:
                                match = shareVideo.search(item)
                                if match is not None:
                                    videoUrl = "%s%s%s%s" % (
                                        "http", match.group(1), "/share/",
                                        match.group(2))
                                    return videoUrl
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #16
    def fetchTextData(self, url, channel):
        try:
            soup = self.fetchUrl(url)
            div = soup.first("div", {"class": "box list channel"})
            if div is None:
                print 'no data', url
                return []
            datalist = div.findAll("li")
            objs = []
            sortType = dateutil.y_m_d()
            for item in datalist:
                ahref = item.first("a")
                if ahref is not None:
                    try:
                        obj = {}
                        span = ahref.first('span')
                        if span is not None:
                            obj['fileDate'] = span.text
                        else:
                            obj['fileDate'] = ''
                        name = ahref.text.replace(obj['fileDate'], '')
                        # strip the "finished" marker the source site appends
                        obj['name'] = name.replace("【完】", "")
                        print name
                        obj['url'] = ahref.get('href')
                        obj['baseurl'] = baseurl
                        obj['channel'] = channel
                        obj['updateTime'] = datetime.datetime.now()
                        ret = self.fetchText(ahref.get('href'))
                        if ret is None:
                            print 'no article data', ahref.get('href')
                            continue
                        obj['sortType'] = sortType
                        objs.append(obj)
                    except Exception as e:
                        print common.format_exception(e)
            return objs
        except Exception as e:
            print common.format_exception(e)
Example #17
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url)
            div = soup.first("div", {"class": "details-con2-body"})
            if div is not None:
                ahref = div.first("a")
                if ahref is not None:
                    soup = self.fetchUrl(ahref.get("href"))
                    player = soup.first("div",
                                        {"class": "player-box details-body"})
                    if player is not None:
                        script = player.first("script")
                        if script is not None:
                            content = unquote(str(script.text))
                            match = regVideo.search(content)
                            if match is not None:
                                obj = json.loads(match.group(1))
                                data = obj.get('Data', [])
                                urlData = []
                                for item in data:
                                    for itemUrl in item.get('playurls', []):
                                        for itemurlOne in itemUrl:
                                            if itemurlOne.count('http') > 0:
                                                urlData.append(itemurlOne)
                                # same preference order as Example #4:
                                # m3u8, then /share/, then whatever is left
                                for item in urlData:
                                    if item.count('m3u8'):
                                        return item
                                for item in urlData:
                                    if item.count('/share/'):
                                        return item
                                if len(urlData) > 0:
                                    return urlData[0]
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #18
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url)
            iframe = soup.first("iframe")
            if iframe is not None:
                v = iframe.get("src").replace("&#46;", ".")
                match = video_iframe.search(v)
                if match is not None:
                    vid = v.replace("https://baiduyunbo.com/?id=", "")
                    return video_m3u8 % (vid)
                else:
                    soup = self.fetchUrlWithBase(v)
                    scripts = soup.findAll("script")
                    for script in scripts:
                        match = video_mp4.search(script.text)
                        if match is not None:
                            return "%s%s%s" % ("http", match.group(1), "mp4")

            print url, 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #19
    def parseDomVideo(self, base, url):
        try:
            soup = self.fetchUrl(url, header)
            divs = soup.findAll("div")
            urls = []
            for div in divs:
                ahref = div.first("a")
                divTitle = div.first("div")
                if (ahref is not None and divTitle is not None
                        and ahref.get("rel") is not None):
                    # prefer the h5 heading as the name when one exists
                    h5 = div.first("h5")
                    name = divTitle.text
                    if h5 is not None:
                        name = h5.text
                    obj = {}
                    obj['name'] = name
                    obj['url'] = ahref.get("href")
                    urls.append(obj)

            return urls
        except Exception as e:
            print common.format_exception(e)
            return None
Example #20
    def fetchUrl(self, url, aheader=header):
        count = 0
        while count < maxCount:
            try:
                req = urllib2.Request(baseurl + url, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13',
                    "Referer": baseurl})
                response = urllib2.urlopen(req, timeout=300)
                # check whether the server gzip-compressed the response
                gzipped = response.headers.get('Content-Encoding')
                content = response.read()
                if gzipped:
                    # unpack the gzipped page source
                    content = zlib.decompress(content, 16 + zlib.MAX_WBITS)
                # this site serves GBK-encoded pages
                soup = BeautifulSoup(content.decode('gbk', errors='replace'))
                return soup
            except Exception as e:
                print common.format_exception(e)
                print 'error opening page, retrying', baseurl + url, 'attempt', count
                count += 1

        print 'page still failing after', maxCount, 'retries', url
        return BeautifulSoup('')
Example #21
    def parseDomVideo(self, url):
        try:
            ID = url.replace(".html", "").replace("/klav-video/", "")
            vediourl = "/klav-play/%s-1-1.html" % (ID)
            soup = self.fetchUrl(baseurl7 + vediourl, header)
            div = soup.first("div", {"class": "pages"})
            if div is not None:
                texts = div.text.split(";")
                for text in texts:
                    # try the m3u8 pattern first, then fall back to mp4;
                    # the original overwrote the mp4 match before using it
                    match = regVideoM3.search(text)
                    if match is not None:
                        return "%s%s%s" % ("http", match.group(1), 'm3u8')
                    match = regVideoMp4.search(text)
                    if match is not None:
                        return "%s%s%s" % ("http", match.group(1), 'mp4')

            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #22
    def fetchUrlWithBase(self, url, aheader=header):
        count = 0
        while count < maxCount:
            try:
                req = urllib2.Request(url, headers={'Cookie':"td_cookie=18446744069599001696; UM_distinctid=16267a77486203-0a34f7eb9f837-454c092b-1fa400-16267a7748726f; CNZZDATA4033785=cnzz_eid%3D1967344694-1522153663-null%26ntime%3D1522153663; CNZZDATA1263493226=2025093065-1522155903-null%7C1522155903; PHPSESSID=cqppj1tg9v8tf27j95ogqogjs1; td_cookie=18446744069599206493; WSKY=6c172; jiathis_rdc=%7B%22http%3A//www.zxdy.cc/vod/22266.html%22%3A1739039602%2C%22http%3A//www.zxdy.cc/play/22266-0-1.html%22%3A1739044927%2C%22http%3A//www.zxdy.cc/Uploads/https%3A//tupian.tupianzy.com/pic/upload/vod/2018-03-03/201803031520062617.jpg%22%3A1739118415%2C%22http%3A//www.zxdy.cc/list/1-p-3-0.html%22%3A1739129605%2C%22http%3A//www.zxdy.cc/list/1-p-1-0.html%22%3A1739216767%2C%22http%3A//www.zxdy.cc/list/9-p-1-0.html%22%3A1739358031%2C%22http%3A//www.zxdy.cc/list/9-p-2-0.html%22%3A1739371664%2C%22http%3A//www.zxdy.cc/Uploads/https%3A//wx3.sinaimg.cn/mw690/005w5c6ogy1fjuo496v5uj30tu15ok3k.jpg%22%3A1739577535%2C%22http%3A//www.zxdy.cc/Uploads/https%3A//img.alicdn.com/imgextra/i4/2264228004/TB2UynHnQqvpuFjSZFhXXaOgXXa_%21%212264228004.jpg%22%3A1739585958%2C%22http%3A//www.zxdy.cc/%22%3A1739586271%2C%22http%3A//www.zxdy.cc/vod/5128.html%22%3A1739763188%2C%22http%3A//www.zxdy.cc/vod/1.html%22%3A1739772004%2C%22http%3A//www.zxdy.cc/play/1-0-1.html%22%3A1739777508%2C%22http%3A//www.zxdy.cc/vod/4063.html%22%3A1739811363%2C%22http%3A//www.zxdy.cc/play/4063-0-2.html%22%3A1739820736%2C%22http%3A//www.zxdy.cc/list/11-p-1-0.html%22%3A1739855919%2C%22http%3A//www.zxdy.cc/vod/22236.html%22%3A0%7C1522158279843%2C%22http%3A//www.zxdy.cc/play/22236-0-1.html%22%3A%220%7C1522158307282%22%7D"
                    ,'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html, Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13', "Referer":url})
                response = urllib2.urlopen(req, timeout=300)
                # check whether the server compressed the response
                gzipped = response.headers.get('Content-Encoding')
                content = response.read()
                if gzipped:
                    # this site sends a raw deflate stream, hence -MAX_WBITS
                    content = zlib.decompress(content, -zlib.MAX_WBITS)
                soup = BeautifulSoup(content.decode('utf8', errors='replace'))
                return soup
            except Exception as e:
                print common.format_exception(e)
                print 'error opening page, retrying', url, 'attempt', count
                count += 1

        print 'page still failing after', maxCount, 'retries', url
        return BeautifulSoup('')
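The wbits argument is what separates the decompress calls across these helpers: 16 + zlib.MAX_WBITS expects gzip framing, -zlib.MAX_WBITS a raw deflate stream (as this site sends), and plain zlib.MAX_WBITS a zlib-wrapped stream. A round-trip sketch of all three with an invented payload; 32 + zlib.MAX_WBITS, which auto-detects gzip versus zlib framing, would also cover most of these sites:

import gzip
import io
import zlib

payload = b'sample page source'

# gzip framing: decompress with 16 + MAX_WBITS
buf = io.BytesIO()
gz = gzip.GzipFile(fileobj=buf, mode='wb')
gz.write(payload)
gz.close()
assert zlib.decompress(buf.getvalue(), 16 + zlib.MAX_WBITS) == payload

# zlib framing: the default MAX_WBITS
assert zlib.decompress(zlib.compress(payload), zlib.MAX_WBITS) == payload

# raw deflate: negative wbits, no header or checksum
co = zlib.compressobj(6, zlib.DEFLATED, -zlib.MAX_WBITS)
raw = co.compress(payload) + co.flush()
assert zlib.decompress(raw, -zlib.MAX_WBITS) == payload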
Example #23
    def fetchUrl(self, url):
        count = 0
        while count < maxCount:
            try:
                req = urllib2.Request(
                    url,
                    headers={
                        "Cookie":
                        "zenid=b227c2098ac37d540e4579fb024e9ba9; __utma=62982011.325664695.1514618636.1514618636.1514618636.1; __utmc=62982011; __utmz=62982011.1514618636.1.1.utmcsr=seqing.one|utmccn=(referral)|utmcmd=referral|utmcct=/2059.html; __atuvc=7%7C52; __utmb=62982011.35.10.1514618636",
                        "Upgrade-Insecure-Requests": "1",
                        'User-Agent':
                        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13',
                        "Referer": "http://www.eroti-cart.com"
                    })
                # follow redirects through the custom handler
                opener = urllib2.build_opener()
                opener.add_handler(SmartRedirectHandler())
                urllib2.install_opener(opener)
                response = urllib2.urlopen(req, timeout=300)
                # check whether the server gzip-compressed the response
                gzipped = response.headers.get('Content-Encoding')
                content = response.read()
                if gzipped:
                    # decompress the raw bytes before decoding to text
                    content = zlib.decompress(content, 16 + zlib.MAX_WBITS)
                soup = BeautifulSoup(content.decode('utf8', errors='replace'))
                return soup
            except Exception as e:
                print common.format_exception(e)
                print 'error opening page, retrying', url, 'attempt', count
                count += 1

        print 'page still failing after', maxCount, 'retries', url
        return BeautifulSoup('')
Example #24
    def fetchTextData(self, url, channel):
        try:
            soup = self.fetchUrl(baseurl + url)
            div = soup.first("div", {"class": "novelList"})
            if div is None:
                print 'no data', url
                return []
            datalist = div.findAll("a")
            objs = []
            sortType = dateutil.y_m_d()
            for item in datalist:
                try:
                    obj = {}
                    # the trailing spaces in the class name match the site's markup
                    span = item.first('div', {"class": "pull-right date    "})
                    if span is not None:
                        obj['fileDate'] = span.text
                    else:
                        obj['fileDate'] = ''
                    name = item.first("div", {"class": "pull-left"}).text
                    # strip the "finished" marker the source site appends
                    obj['name'] = name.replace("【完】", "")
                    print name
                    obj['url'] = item.get('href')
                    obj['baseurl'] = baseurl
                    obj['channel'] = channel
                    obj['updateTime'] = datetime.datetime.now()
                    ret = self.fetchText(item.get('href'))
                    if ret is None:
                        print 'no article data', item.get('href')
                        continue
                    obj['sortType'] = sortType
                    objs.append(obj)
                except Exception as e:
                    print common.format_exception(e)
            return objs
        except Exception as e:
            print common.format_exception(e)
Example #25
    def parseDomVideo(self, url):
        header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            "Referer": url
        }
        obj = {}
        try:
            soup = self.fetchUrl(url, header)
            playtool = soup.first("div", {'class': 'play-wapper'})
            if playtool is not None:
                obj['pic'] = playtool.first('img').get('src')
                ahrefs = playtool.findAll('a')
                for ahref in ahrefs:
                    match = regVideo.search(ahref.text)
                    if match is not None:
                        videoUrl = match.group(1)
                        obj['mp4'] = "%s%s%s" % ("http", videoUrl, 'm3u8')
                        return obj
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #26
    def fetchImgItemsData(self, url, channel):
        # the original called self.fetchUrl(baseurl4, url); joining the two
        # is assumed here, matching the other fetchers in this family
        soup = self.fetchUrl(baseurl4 + url)
        div = soup.first("div", {"class": "zxlist"})
        if div is None:
            print 'no data', url
            return []
        datalist = div.findAll("ul")
        objs = []
        sortType = dateutil.y_m_d()
        for item in datalist:
            ahref = item.first("a")
            if ahref is not None:
                try:
                    obj = {}
                    obj['fileDate'] = item.first('li', {"class": "zxsyd"}).text
                    name = ahref.text
                    obj['name'] = name
                    obj['url'] = ahref.get('href')
                    obj['baseurl'] = baseurl4
                    obj['channel'] = channel
                    obj['updateTime'] = datetime.datetime.now()

                    pics = self.fetchImgs(obj['url'])
                    if len(pics) == 0:
                        print 'no image files --', obj['url'], '---', url
                        continue
                    obj['picList'] = pics
                    obj['showType'] = 3
                    obj['pics'] = len(pics)
                    obj['sortType'] = sortType
                    print name, pics[0], '  url=', obj['url'], '  image count=', len(
                        pics)
                    objs.append(obj)
                except Exception as e:
                    print common.format_exception(e)
        return objs
Example #27
    def parseDomVideo(self, url):
        try:
            soup = self.fetchUrl(url, header)
            adiv = soup.first("div", {"class": "playBar"})
            if adiv is not None:
                ahref = adiv.first('a')
                if ahref is not None:
                    soup = self.fetchUrl(ahref.get("href"), header)
                    style = soup.first("ul", {"style": "text-align:center;;"})
                    if style is not None:
                        script = style.first("script")
                        if script is not None:
                            # the player script is external; fetch and unquote it
                            text = unquote(str(self.fetchUrl(script.get("src"))))
                            for item in text.split("$"):
                                match = regVideo.search(item)
                                if match is not None:
                                    videoUrl = match.group(1)
                                    return "%s%s%s" % ("http", videoUrl, 'm3u8')
            print 'mp4 not found'
            return None
        except Exception as e:
            print common.format_exception(e)
            return None
Example #28
    def run(self):
        dbVPN = db.DbVPN()
        ops = db_ops.DbOps(dbVPN)
        ops.inertTextChannel(self.t_obj)
        dbVPN.commit()
        print self.t_obj
        try:
            channel = self.t_obj['url']
            for i in range(1, maxTextPage):
                # page N lives at <name>-N.html
                url = self.t_obj['url'].replace(".html",
                                                "-") + str(i) + ".html"
                count = self.update(url, ops, channel)
                dbVPN.commit()
                if count == 0:
                    break
            else:
                # for/else: runs only when the loop was not broken out of,
                # i.e. every page up to maxTextPage still yielded items
                self.update(url, ops, channel)
                dbVPN.commit()

            dbVPN.close()
        except Exception as e:
            print common.format_exception(e)
            dbVPN.commit()
            dbVPN.close()
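The for ... else above is easy to misread: the else clause belongs to the loop, not to an if, and runs only when the loop finishes without break, which is why the last page gets one extra update pass when every page up to maxTextPage yielded items. A tiny standalone illustration:

for i in range(1, 4):
    if i == 99:                  # never true, so the loop is never broken
        break
else:
    print 'runs: loop completed without break'

for i in range(1, 4):
    if i == 2:
        break
else:
    print 'skipped: the loop was broken'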
Example #29
    def fetchHeadChannel(self):
        try:
            soup = self.fetchUrl("/")
            menu = soup.first("div", {"id": "nav"})
            if menu is None:
                print 'channel nav not found', baseurl
                return None
            lis = menu.findAll("a")
            ret = []
            for a in lis:
                print a
                # skip the home link ('首页' is the site's home label)
                if a is not None and a.text.find('首页') == -1:
                    row = {}
                    row['name'] = a.text
                    row['baseurl'] = baseurl
                    row['url'] = a.get('href')
                    row['channelType'] = 'normal'
                    row['updateTime'] = datetime.datetime.now()
                    row['channel'] = baseurl.replace("http://", "").replace(
                        "https://", "") + channel_pre + a.get('href')
                    ret.append(row)
            return ret
        except Exception as e:
            print common.format_exception(e)
Example #30
    def fetchContentUrlWithBase(self, url):
        count = 0
        while count < maxCount:
            try:
                req = urllib2.Request(baseurl + url, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13',
                    "Referer": baseurl})
                response = urllib2.urlopen(req, timeout=300)
                content = response.read()
                # check whether the server gzip-compressed the response
                gzipped = response.headers.get('Content-Encoding')
                if gzipped:
                    # unpack the gzipped page source
                    content = zlib.decompress(content, 16 + zlib.MAX_WBITS)
                return content
            except Exception as e:
                print common.format_exception(e)
                print 'error opening page, retrying', baseurl + url, 'attempt', count
                count += 1

        print 'page still failing after', maxCount, 'retries', baseurl + url
        return ''
# p = BaseParse()
# print p.fetchContentUrlWithBase("/list/?37-1.html")