Exemple #1
0
 def start_requests(self):
     """Yield one request per movie/celebrity id that has no image row yet.

     Depending on ``self.type`` the ids come from ``movie_douban`` or
     ``celebrity_douban``; only ids without a matching row in the
     corresponding ``image_*_douban`` table are selected (left join +
     ``is null``), capped at ``default.SELECT_LIMIT``. Each request URL is
     the id wrapped between the matching ``config.URL_IMAGE_*_START`` and
     ``config.URL_IMAGE_*_END`` values, and the id is passed on in
     ``meta['id']``. Any other ``self.type`` yields nothing.
     """
     if self.type == self.type_movie:
         pending_sql = (
             'select movie_douban.id from movie_douban '
             'left join image_movie_douban '
             'on movie_douban.id=image_movie_douban.id_movie_douban '
             'where image_movie_douban.id_movie_douban is null '
             'limit {}'
         ).format(default.SELECT_LIMIT)
         self.cursor.execute(pending_sql)
         for (movie_id,) in self.cursor.fetchall():
             page_url = "{}{}{}".format(config.URL_IMAGE_MOVIE_START,
                                        movie_id,
                                        config.URL_IMAGE_MOVIE_END)
             yield scrapy.Request(url=page_url,
                                  cookies=config.get_cookie_douban(),
                                  meta={'id': movie_id},
                                  callback=self.parse)
         return

     if self.type == self.type_celebrity:
         pending_sql = (
             'select celebrity_douban.id from celebrity_douban '
             'left join image_celebrity_douban '
             'on celebrity_douban.id=image_celebrity_douban.id_celebrity_douban '
             'where image_celebrity_douban.id_celebrity_douban is null '
             'limit {}'
         ).format(default.SELECT_LIMIT)
         self.cursor.execute(pending_sql)
         for (celebrity_id,) in self.cursor.fetchall():
             page_url = "{}{}{}".format(config.URL_IMAGE_CELEBRITY_START,
                                        celebrity_id,
                                        config.URL_IMAGE_CELEBRITY_END)
             yield scrapy.Request(url=page_url,
                                  cookies=config.get_cookie_douban(),
                                  meta={'id': celebrity_id},
                                  callback=self.parse)
Exemple #2
0
 def start_requests(self):
     """Yield a request for every trailer row whose ``url_video`` is empty.

     Ids are read from ``trailer_movie_douban`` (at most
     ``default.SELECT_LIMIT`` rows); each request targets
     ``config.URL_TRAILER_MOVIE`` followed by the id and a trailing slash,
     and carries the id in ``meta['id']``.
     """
     pending_sql = (
         "select id from trailer_movie_douban where url_video='' limit {}"
     ).format(default.SELECT_LIMIT)
     self.cursor.execute(pending_sql)
     for (trailer_id,) in self.cursor.fetchall():
         page_url = "{}{}/".format(config.URL_TRAILER_MOVIE, trailer_id)
         yield scrapy.Request(url=page_url,
                              cookies=config.get_cookie_douban(),
                              meta={'id': trailer_id},
                              callback=self.parse)
Exemple #3
0
 def start_requests(self):
     """Yield a request for each id read from ``celebrity_douban``.

     At most ``default.SELECT_LIMIT`` ids are selected; each request targets
     ``config.URL_CELEBRITY`` followed by the id and a trailing slash, and
     carries the id in ``meta['id']``.
     """
     select_sql = 'select id from celebrity_douban limit {}'.format(
         default.SELECT_LIMIT)
     self.cursor.execute(select_sql)
     for (celebrity_id,) in self.cursor.fetchall():
         page_url = "{}{}/".format(config.URL_CELEBRITY, celebrity_id)
         yield scrapy.Request(url=page_url,
                              cookies=config.get_cookie_douban(),
                              meta={'id': celebrity_id},
                              callback=self.parse)
 def start_requests(self):
     """Yield a request for each ``movie_douban`` row with ``update_date=0``.

     At most ``default.SELECT_LIMIT`` ids are selected; each request targets
     ``config.URL_MOVIE`` followed by the id and a trailing slash, and
     carries the id in ``meta['id']``.
     """
     stale_sql = (
         'select id from movie_douban '
         'where update_date=0 limit {}'
     ).format(default.SELECT_LIMIT)
     self.cursor.execute(stale_sql)
     for (movie_id,) in self.cursor.fetchall():
         page_url = "{}{}/".format(config.URL_MOVIE, movie_id)
         yield scrapy.Request(url=page_url,
                              cookies=config.get_cookie_douban(),
                              meta={'id': movie_id},
                              callback=self.parse)
Exemple #5
0
 def start_requests(self):
     """Yield a request for every movie id that has no comment row yet.

     Ids come from ``movie_douban`` left-joined against
     ``comment_movie_douban`` (rows with no match only), capped at
     ``default.SELECT_LIMIT``. The request URL is the id wrapped between
     ``config.URL_COMMENT_MOVIE_START`` and ``config.URL_COMMENT_MOVIE_END``;
     the id is passed on in ``meta['id']``.
     """
     pending_sql = (
         'select movie_douban.id from movie_douban '
         'left join comment_movie_douban '
         'on movie_douban.id=comment_movie_douban.id_movie_douban '
         'where comment_movie_douban.id_movie_douban is null '
         'limit {}'
     ).format(default.SELECT_LIMIT)
     self.cursor.execute(pending_sql)
     for (movie_id,) in self.cursor.fetchall():
         page_url = "{}{}{}".format(config.URL_COMMENT_MOVIE_START,
                                    movie_id,
                                    config.URL_COMMENT_MOVIE_END)
         yield scrapy.Request(url=page_url,
                              cookies=config.get_cookie_douban(),
                              meta={'id': movie_id},
                              callback=self.parse)
 def start_requests(self):
     """Yield one request per offset 0, 15, 30, ... below ``config.NEW_MOVIE_MAX``.

     Each request URL is ``config.URL_MOVIE_NEW`` followed by the offset,
     and the offset is passed on in ``meta['num']``.
     """
     offsets = range(0, config.NEW_MOVIE_MAX, 15)
     for offset in offsets:
         listing_url = "{}{}".format(config.URL_MOVIE_NEW, offset)
         yield scrapy.Request(url=listing_url,
                              cookies=config.get_cookie_douban(),
                              meta={'num': offset},
                              callback=self.parse)
 def start_requests(self):
     """Yield Douban search requests for records not yet linked to Douban.

     The work done depends on ``self.type``:

     * ``type_movie_imdb``: search by IMDb title id (``tt`` + 7 digits) for
       ``movie_imdb`` rows with no ``movie_douban`` counterpart.
     * ``type_movie_scene``: for ``movie_scene`` rows with
       ``id_movie_douban=0``, try ``self.match_movie(name, year)`` first and
       store the match; otherwise search by Chinese name.
     * ``type_movie_resource``: for ``resource_movie`` rows with
       ``id_movie_douban=0``, resolve through the IMDb id when present,
       else through ``self.match_movie``; store the match or fall back to
       a search request.
     * ``type_celebrity_imdb``: search by IMDb name id (``nm`` + 7 digits)
       for ``celebrity_imdb`` rows with no ``celebrity_douban`` counterpart.
     * ``type_celebrity_scene``: for ``celebrity_scene`` rows, link to the
       single ``celebrity_douban`` row whose ``name_origin`` equals the
       English name, otherwise search by that name.

     Every select is capped at ``default.SELECT_LIMIT``. Branches that
     write matches commit once after the loop, i.e. only after the
     generator has been fully consumed.

     Row values are passed to the driver as ``%s`` parameters instead of
     being formatted into the SQL text; this assumes a MySQL DB-API driver
     (the statements use ``on duplicate key update``).

     Yields:
         scrapy.Request: requests handled by ``self.parse``; ``meta``
         carries the source row ``id`` and, for movies, ``start_year``.
     """
     # Movies taken from the IMDb table.
     if self.type == self.type_movie_imdb:
         self.cursor.execute(
             'select movie_imdb.id,movie_imdb.start_year from movie_imdb '
             'left join movie_douban '
             'on movie_imdb.id=movie_douban.id_movie_imdb '
             'where movie_douban.id_movie_imdb is null '
             'and movie_imdb.is_douban_updated=0 '
             'limit {}'.format(default.SELECT_LIMIT))
         for imdb_id, start_year in self.cursor.fetchall():
             yield scrapy.Request(url=config.URL_SEARCH_MOVIE + 'tt' +
                                  '%07d' % imdb_id,
                                  meta={
                                      'id': imdb_id,
                                      'start_year': start_year
                                  },
                                  cookies=config.get_cookie_douban(),
                                  callback=self.parse)
     # Movies taken from the scene table.
     elif self.type == self.type_movie_scene:
         self.cursor.execute(
             'select id,name_zh,start_year from movie_scene '
             'where id_movie_douban=0 '
             'limit {}'.format(default.SELECT_LIMIT))
         for scene_id, name_zh, start_year in self.cursor.fetchall():
             result = self.match_movie(name_zh, start_year)
             # Matched a movie in movie_douban: record the link in the
             # database. result[0] is the match flag, result[1] the id.
             if result[0]:
                 self.cursor_update.execute(
                     'insert into movie_scene(id,id_movie_douban) '
                     'values(%s,%s) '
                     'on duplicate key update '
                     'id_movie_douban=%s',
                     (scene_id, result[1], result[1]))
                 print('scene (mysql) ---------')
                 print(scene_id)
                 print(result[1])
                 self.logger.info(
                     'get mysql- list success,id:{},name:{},type:{}'.format(
                         scene_id, name_zh, self.type))
             # No local match: fall back to a search request.
             else:
                 yield scrapy.Request(url=config.URL_SEARCH_MOVIE + name_zh,
                                      meta={
                                          'id': scene_id,
                                          'start_year': start_year
                                      },
                                      cookies=config.get_cookie_douban(),
                                      callback=self.parse)
         self.conn.commit()
     # Movies taken from the resource table, matched by IMDb id or by
     # name and year.
     # NOTE(review): unlike the other branches, the requests below are sent
     # without cookies; kept as in the original - confirm this is intended.
     elif self.type == self.type_movie_resource:
         self.cursor.execute(
             'select id,id_movie_imdb,name_zh,create_year from resource_movie '
             'where id_movie_douban=0 '
             'and id_website_resource> 100 '
             'and id_type_resource>=100 '
             'limit {}'.format(default.SELECT_LIMIT))
         for resource_id, id_movie_imdb, name_zh, create_year in (
                 self.cursor.fetchall()):
             # An IMDb id is known: look the movie up through it.
             if id_movie_imdb != 0:
                 lookup = self.conn.cursor()
                 try:
                     lookup.execute(
                         'select id from movie_douban where id_movie_imdb=%s',
                         (id_movie_imdb,))
                     result = lookup.fetchall()
                 finally:
                     # The original leaked one cursor per row.
                     lookup.close()
                 if len(result) == 1:
                     id_movie_douban = result[0][0]
                 # Not uniquely known locally: search by IMDb id.
                 else:
                     yield scrapy.Request(url=config.URL_SEARCH_MOVIE +
                                          'tt' + '%07d' % id_movie_imdb,
                                          meta={
                                              'id': resource_id,
                                              'start_year': create_year
                                          },
                                          callback=self.parse)
                     continue
             # IMDb id is 0: match by movie name and year instead.
             else:
                 result = self.match_movie(name_zh, create_year)
                 id_movie_douban = result[1] if result[0] else 0
             # Matched a movie in movie_douban: record the link.
             if id_movie_douban != 0:
                 # The original format string ended in '{1}}', which makes
                 # str.format raise ValueError; parameters avoid that.
                 self.cursor_update.execute(
                     'insert into resource_movie(id,id_movie_douban) '
                     'values (%s,%s) '
                     'on duplicate key update '
                     'id_movie_douban=%s',
                     (resource_id, id_movie_douban, id_movie_douban))
                 print('resource (mysql) ----------')
                 print(resource_id)
                 print(id_movie_douban)
             # No local match: fall back to a search request.
             else:
                 yield scrapy.Request(url='{}{}'.format(
                     config.URL_SEARCH_MOVIE, name_zh),
                                      meta={
                                          'id': resource_id,
                                          'start_year': create_year
                                      },
                                      callback=self.parse)
         self.conn.commit()
     # Celebrities taken from the IMDb table.
     elif self.type == self.type_celebrity_imdb:
         self.cursor.execute(
             'select celebrity_imdb.id from celebrity_imdb '
             'left join celebrity_douban '
             'on celebrity_imdb.id=celebrity_douban.id_celebrity_imdb '
             'where celebrity_douban.id_celebrity_imdb is null '
             'and celebrity_imdb.is_douban_updated=0 '
             'limit {}'.format(default.SELECT_LIMIT))
         for (imdb_id,) in self.cursor.fetchall():
             yield scrapy.Request(url=config.URL_SEARCH_MOVIE + 'nm' +
                                  '%07d' % imdb_id,
                                  meta={'id': imdb_id},
                                  cookies=config.get_cookie_douban(),
                                  callback=self.parse)
     # Celebrities taken from the scene table.
     elif self.type == self.type_celebrity_scene:
         self.cursor.execute('select id,name_en from celebrity_scene '
                             'where id_celebrity_douban=0 and name_en!="" '
                             'limit {}'.format(default.SELECT_LIMIT))
         for scene_id, name_en in self.cursor.fetchall():
             # Try to match the celebrity in celebrity_douban by name.
             # The name is passed as a parameter so that quotes inside it
             # cannot break (or inject into) the statement.
             lookup = self.conn.cursor()
             try:
                 lookup.execute(
                     'select id from celebrity_douban '
                     'where name_origin=%s', (name_en,))
                 result = lookup.fetchall()
             finally:
                 lookup.close()
             # Exactly one celebrity matched: record the link.
             if len(result) == 1:
                 self.cursor_update.execute(
                     'insert into celebrity_scene(id,id_celebrity_douban) '
                     'values (%s,%s) '
                     'on duplicate key update '
                     'id_celebrity_douban=%s',
                     (scene_id, result[0][0], result[0][0]))
                 print('celebrity_scene (mysql) ----------')
                 print(scene_id)
                 print(result[0][0])
             # No unique match: fall back to a search request.
             else:
                 yield scrapy.Request(url=config.URL_SEARCH_MOVIE + name_en,
                                      meta={'id': scene_id},
                                      cookies=config.get_cookie_douban(),
                                      callback=self.parse)
         self.conn.commit()