Esempio n. 1
0
 def get_movie(self, url):
     """Scrape one movie detail page and store it in the ``bs_movie`` table.

     The movie id is the run of digits in front of ``.html`` in *url*.
     Pages whose id is already in ``self.idList`` are skipped.  Name,
     release date, score, introduction and download text are pulled out
     of the page and written with a single insert.  When a row with the
     same name or id already exists it is replaced only if this page has
     the larger id; otherwise nothing is written.

     Args:
         url: Address of the detail page, expected to end in
             ``/<digits>.html``.

     Returns:
         None.  Any failure is printed together with *url* and swallowed,
         so one bad page does not abort the caller.
     """

     def sql_text(value):
         # Backslashes first, so the backslash added in front of each
         # quote is not doubled again.
         # NOTE(review): backslash escaping matches what the original
         # code did for the introduction; confirm the target database
         # accepts it.  If the DB helper supports bound parameters, use
         # those instead of string-built SQL.
         return value.replace('\\', '\\\\').replace('\'', '\\\'')

     try:
         id_match = re.search(r'.*/(?P<id>\d*)\.html', url)
         if id_match is None or not id_match.group('id'):
             print('no numeric movie id in url', url)
             return
         movie_id = id_match.group('id')
         if movie_id in self.idList:
             return

         # Strip the XHTML namespace attribute before parsing, then decode
         # the page as gb2312, ignoring undecodable bytes.
         html = Grab.get_content(url).replace(
             'xmlns="http://www.w3.org/1999/xhtml" /',
             '').replace('xmlns="http://www.w3.org/1999/xhtml"',
                         '').decode('gb2312', 'ignore')
         doc = pq(html)
         download = doc('#Zoom table td:first-child').eq(0).text()

         heading = doc('.bd3r .co_area2 .title_all h1').text()
         name_match = re.search(r'.*《(.*)》.*', heading)
         if name_match is None:
             print('movie title not found in page heading', url)
             return
         name = name_match.group(1)

         content = doc('#Zoom p:first-child')
         if not content:
             return
         content = content.remove('br').html().replace(' ', '')

         # Release date: accept 2017-01-01 as well as the year/month/day
         # character form and normalise both to YYYY-MM-DD.  The day
         # marker is dropped, not turned into a trailing dash.
         date = None
         found = re.match(
             r'.*◎年代.*(?:.*/)?(?P<date>\d{4}[-年]\d{2}[-月]\d{2}日?)\(.*',
             content)
         if found:
             date = found.group('date').strip().replace(
                 '年', '-').replace('月', '-').replace('日', '')

         # Score: the first pattern that matches wins, even when its
         # captured text is empty (which is stored as '0').
         score = '0'
         score_patterns = (
             re.compile(r'.*◎豆瓣评分(?P<score>.*)/10 from .* users.*'),
             re.compile(
                 r'.*◎IMDb评分(?P<score>.*)/10 from .* users.*', re.I),
         )
         for score_re in score_patterns:
             found = score_re.match(content)
             if found:
                 score = found.group('score').strip().replace(
                     ',', '.') or '0'
                 break

         # Introduction: prefer the text that stops at the awards section,
         # fall back to everything up to the first image.
         introduction = ''
         intro_patterns = (
             r'.*◎简介(?P<introduction>.*)◎获奖情况.*<img.*',
             r'.*◎简介(?P<introduction>.*).*<img.*',
         )
         for intro_re in intro_patterns:
             found = re.match(intro_re, content)
             if found:
                 introduction = found.group('introduction').strip()
                 break
         # Over-long introductions are dropped rather than truncated.  The
         # limit is checked on the raw text, i.e. on what is stored.
         if len(introduction) >= 1024:
             introduction = ''

         statements = []
         lookup = "select * from bs_movie where name='%s' or id='%s'" % (
             sql_text(name), movie_id)
         existing = DB.fetchone(lookup)
         if existing:
             # The first column of the row is compared as the stored id;
             # only a page with a larger id replaces the existing row.
             if int(movie_id) > existing[0]:
                 statements.append(
                     "delete from bs_movie where id='%s'" % existing[0])
             else:
                 return

         # A missing date is written as SQL NULL, not as a quoted string.
         date_sql = "'%s'" % date if date else 'NULL'
         statements.append(
             "insert into bs_movie "
             "(id,name,date,score,introduction,url,download) "
             "values ('%s','%s',%s,'%s','%s','%s','%s')" % (
                 movie_id, sql_text(name), date_sql, sql_text(score),
                 sql_text(introduction), sql_text(url),
                 sql_text(download)))

         # Delete + insert must go through together; a lone insert does
         # not need the transaction helper.
         if len(statements) == 1:
             DB.execute(statements[0])
         else:
             DB.doTrans(statements)
     except Exception as ex:
         print(str(ex), url)