def start_requests(self):
    """Yield image-page requests for records that have no image row yet.

    When ``self.type`` equals ``self.type_movie`` the ids come from
    ``movie_douban`` left-joined against ``image_movie_douban``; when it
    equals ``self.type_celebrity`` they come from ``celebrity_douban``
    left-joined against ``image_celebrity_douban``. In both cases only ids
    without a matching image row are selected, limited to
    ``default.SELECT_LIMIT`` rows. Any other type yields nothing.
    """
    if self.type == self.type_movie:
        query = (
            'select movie_douban.id from movie_douban '
            'left join image_movie_douban '
            'on movie_douban.id=image_movie_douban.id_movie_douban '
            'where image_movie_douban.id_movie_douban is null '
            'limit {}'
        )
        url_prefix = config.URL_IMAGE_MOVIE_START
        url_suffix = config.URL_IMAGE_MOVIE_END
    elif self.type == self.type_celebrity:
        query = (
            'select celebrity_douban.id from celebrity_douban '
            'left join image_celebrity_douban '
            'on celebrity_douban.id=image_celebrity_douban.id_celebrity_douban '
            'where image_celebrity_douban.id_celebrity_douban is null '
            'limit {}'
        )
        url_prefix = config.URL_IMAGE_CELEBRITY_START
        url_suffix = config.URL_IMAGE_CELEBRITY_END
    else:
        # Unknown type: nothing to query, nothing to request.
        return

    self.cursor.execute(query.format(default.SELECT_LIMIT))
    for (douban_id,) in self.cursor.fetchall():
        page_url = "{}{}{}".format(url_prefix, douban_id, url_suffix)
        yield scrapy.Request(
            url=page_url,
            cookies=config.get_cookie_douban(),
            meta={'id': douban_id},
            callback=self.parse,
        )
def start_requests(self):
    """Yield one trailer-page request per trailer row lacking a video URL.

    Selects up to ``default.SELECT_LIMIT`` ids from ``trailer_movie_douban``
    whose ``url_video`` is the empty string and requests
    ``config.URL_TRAILER_MOVIE`` + id + ``/`` for each of them.
    """
    query = "select id from trailer_movie_douban where url_video='' limit {}".format(
        default.SELECT_LIMIT)
    self.cursor.execute(query)

    pending_rows = self.cursor.fetchall()
    for (trailer_id,) in pending_rows:
        trailer_url = "{}{}/".format(config.URL_TRAILER_MOVIE, trailer_id)
        yield scrapy.Request(
            url=trailer_url,
            cookies=config.get_cookie_douban(),
            meta={'id': trailer_id},
            callback=self.parse,
        )
def start_requests(self):
    """Yield a celebrity-page request for each id read from celebrity_douban.

    At most ``default.SELECT_LIMIT`` ids are selected; every request carries
    the id in ``meta['id']`` and is handled by ``self.parse``.
    """
    limit = default.SELECT_LIMIT
    self.cursor.execute('select id from celebrity_douban limit {}'.format(limit))

    for (celebrity_id,) in self.cursor.fetchall():
        request_kwargs = {
            'url': "{}{}/".format(config.URL_CELEBRITY, celebrity_id),
            'cookies': config.get_cookie_douban(),
            'meta': {'id': celebrity_id},
            'callback': self.parse,
        }
        yield scrapy.Request(**request_kwargs)
def start_requests(self):
    """Yield a movie-page request for each movie whose update_date is 0.

    Reads up to ``default.SELECT_LIMIT`` ids from ``movie_douban`` and
    requests ``config.URL_MOVIE`` + id + ``/`` for each one.
    """
    sql = (
        'select id from movie_douban '
        'where update_date=0 limit {}'
    ).format(default.SELECT_LIMIT)
    self.cursor.execute(sql)

    for row in self.cursor.fetchall():
        (movie_id,) = row
        yield scrapy.Request(
            url="{}{}/".format(config.URL_MOVIE, movie_id),
            cookies=config.get_cookie_douban(),
            meta={'id': movie_id},
            callback=self.parse,
        )
def start_requests(self):
    """Yield comment-page requests for movies that have no comment row yet.

    ``movie_douban`` is left-joined against ``comment_movie_douban`` and only
    ids without a matching comment row are kept, limited to
    ``default.SELECT_LIMIT`` rows.
    """
    select_uncommented = (
        'select movie_douban.id from movie_douban '
        'left join comment_movie_douban '
        'on movie_douban.id=comment_movie_douban.id_movie_douban '
        'where comment_movie_douban.id_movie_douban is null '
        'limit {}'
    )
    self.cursor.execute(select_uncommented.format(default.SELECT_LIMIT))

    for (movie_id,) in self.cursor.fetchall():
        comment_url = "{}{}{}".format(
            config.URL_COMMENT_MOVIE_START,
            movie_id,
            config.URL_COMMENT_MOVIE_END,
        )
        yield scrapy.Request(
            url=comment_url,
            cookies=config.get_cookie_douban(),
            meta={'id': movie_id},
            callback=self.parse,
        )
def start_requests(self):
    """Yield one new-movie listing request per offset.

    Offsets run from 0 up to (but excluding) ``config.NEW_MOVIE_MAX`` in
    steps of 15; each offset is appended to ``config.URL_MOVIE_NEW`` and
    passed along in ``meta['num']``.
    """
    step = 15
    offset = 0
    while offset < config.NEW_MOVIE_MAX:
        listing_url = "{}{}".format(config.URL_MOVIE_NEW, offset)
        yield scrapy.Request(
            url=listing_url,
            cookies=config.get_cookie_douban(),
            meta={'num': offset},
            callback=self.parse,
        )
        offset += step
def start_requests(self):
    """Yield Douban search requests for records not yet linked to Douban.

    The source table is chosen by ``self.type``:

    * ``type_movie_imdb``: movies in ``movie_imdb`` with no ``movie_douban``
      row and ``is_douban_updated=0`` are searched by ``tt``-prefixed id.
    * ``type_movie_scene``: rows of ``movie_scene`` with
      ``id_movie_douban=0`` are first matched locally through
      ``self.match_movie``; matches are written back, misses are searched
      by Chinese name.
    * ``type_movie_resource``: rows of ``resource_movie`` are matched by
      imdb id (when non-zero) or by name and year; matches are written
      back, misses are searched.
    * ``type_celebrity_imdb``: celebrities in ``celebrity_imdb`` with no
      ``celebrity_douban`` row are searched by ``nm``-prefixed id.
    * ``type_celebrity_scene``: rows of ``celebrity_scene`` are matched by
      ``name_origin``; a unique match is written back, anything else is
      searched by English name.

    Every query is limited to ``default.SELECT_LIMIT`` rows. Branches that
    write through ``self.cursor_update`` call ``self.conn.commit()`` once
    their loop has finished. Any other type yields nothing.
    """
    # Pending movies taken from imdb.
    if self.type == self.type_movie_imdb:
        self.cursor.execute(
            'select movie_imdb.id,movie_imdb.start_year from movie_imdb '
            'left join movie_douban '
            'on movie_imdb.id=movie_douban.id_movie_imdb '
            'where movie_douban.id_movie_imdb is null '
            'and movie_imdb.is_douban_updated=0 '
            'limit {}'.format(default.SELECT_LIMIT))
        for imdb_id, start_year in self.cursor.fetchall():
            yield scrapy.Request(
                url=config.URL_SEARCH_MOVIE + 'tt' + '%07d' % imdb_id,
                meta={'id': imdb_id, 'start_year': start_year},
                cookies=config.get_cookie_douban(),
                callback=self.parse)

    # Pending movies taken from scene.
    elif self.type == self.type_movie_scene:
        self.cursor.execute(
            'select id,name_zh,start_year from movie_scene '
            'where id_movie_douban=0 '
            'limit {}'.format(default.SELECT_LIMIT))
        for scene_id, name_zh, start_year in self.cursor.fetchall():
            result = self.match_movie(name_zh, start_year)
            if result[0]:
                # Found in movie_douban: store the link directly.
                self.cursor_update.execute(
                    'insert into movie_scene(id,id_movie_douban) values({0},{1}) '
                    'on duplicate key update '
                    'id_movie_douban={1}'.format(scene_id, result[1]))
                print('scene (mysql) ---------')
                print(scene_id)
                print(result[1])
                self.logger.info(
                    'get mysql- list success,id:{},name:{},type:{}'.format(
                        scene_id, name_zh, self.type))
            else:
                # No local match: fall back to the search page.
                yield scrapy.Request(
                    url=config.URL_SEARCH_MOVIE + name_zh,
                    meta={'id': scene_id, 'start_year': start_year},
                    cookies=config.get_cookie_douban(),
                    callback=self.parse)
        self.conn.commit()

    # Pending movies taken from resource, matched by imdb id or name/year.
    # NOTE(review): unlike the other branches, the requests below are sent
    # without cookies -- confirm this is intentional.
    elif self.type == self.type_movie_resource:
        self.cursor.execute(
            'select id,id_movie_imdb,name_zh,create_year from resource_movie '
            'where id_movie_douban=0 '
            'and id_website_resource> 100 '
            'and id_type_resource>=100 '
            'limit {}'.format(default.SELECT_LIMIT))
        rows = self.cursor.fetchall()
        # One lookup cursor for the whole branch instead of one per row.
        lookup_cursor = self.conn.cursor()
        try:
            for resource_id, id_movie_imdb, name_zh, create_year in rows:
                if id_movie_imdb != 0:
                    # Look the movie up by its imdb id.
                    lookup_cursor.execute(
                        'select id from movie_douban where id_movie_imdb={}'.
                        format(id_movie_imdb))
                    result = lookup_cursor.fetchall()
                    if len(result) == 1:
                        id_movie_douban = result[0][0]
                    else:
                        # Not uniquely known locally: search by imdb id.
                        yield scrapy.Request(
                            url=config.URL_SEARCH_MOVIE + 'tt' +
                            '%07d' % id_movie_imdb,
                            meta={'id': resource_id, 'start_year': create_year},
                            callback=self.parse)
                        continue
                else:
                    # imdb id is 0: match by movie name and year instead.
                    result = self.match_movie(name_zh, create_year)
                    id_movie_douban = result[1] if result[0] else 0

                if id_movie_douban != 0:
                    # Found in movie_douban: store the link directly.
                    # The template previously ended in "{1}}", which made
                    # str.format raise ValueError on every local match.
                    self.cursor_update.execute(
                        'insert into resource_movie(id,id_movie_douban) '
                        'values ({0},{1}) '
                        'on duplicate key update '
                        'id_movie_douban={1}'.format(resource_id,
                                                     id_movie_douban))
                    print('resource (mysql) ----------')
                    print(resource_id)
                    print(id_movie_douban)
                else:
                    # No local match: fall back to the search page.
                    yield scrapy.Request(
                        url='{}{}'.format(config.URL_SEARCH_MOVIE, name_zh),
                        meta={'id': resource_id, 'start_year': create_year},
                        callback=self.parse)
        finally:
            lookup_cursor.close()
        self.conn.commit()

    # Pending celebrities taken from imdb.
    elif self.type == self.type_celebrity_imdb:
        self.cursor.execute(
            'select celebrity_imdb.id from celebrity_imdb '
            'left join celebrity_douban '
            'on celebrity_imdb.id=celebrity_douban.id_celebrity_imdb '
            'where celebrity_douban.id_celebrity_imdb is null '
            'and celebrity_imdb.is_douban_updated=0 '
            'limit {}'.format(default.SELECT_LIMIT))
        for (celebrity_id,) in self.cursor.fetchall():
            yield scrapy.Request(
                url=config.URL_SEARCH_MOVIE + 'nm' + '%07d' % celebrity_id,
                meta={'id': celebrity_id},
                cookies=config.get_cookie_douban(),
                callback=self.parse)

    # Pending celebrities taken from scene.
    elif self.type == self.type_celebrity_scene:
        self.cursor.execute('select id,name_en from celebrity_scene '
                            'where id_celebrity_douban=0 and name_en!="" '
                            'limit {}'.format(default.SELECT_LIMIT))
        rows = self.cursor.fetchall()
        # One lookup cursor for the whole branch instead of one per row.
        lookup_cursor = self.conn.cursor()
        try:
            for celebrity_id, name_en in rows:
                # Match the celebrity by original name. The name is passed
                # as a query parameter so quotes inside it cannot break the
                # statement (assumes the driver's "%s" paramstyle, as used
                # by the common MySQL drivers).
                lookup_cursor.execute(
                    'select id from celebrity_douban '
                    'where name_origin=%s', (name_en,))
                result = lookup_cursor.fetchall()
                if len(result) == 1:
                    # Exactly one celebrity matched: store the link.
                    self.cursor_update.execute(
                        'insert into celebrity_scene(id,id_celebrity_douban) values ({0},{1}) '
                        'on duplicate key update '
                        'id_celebrity_douban={1} '.format(celebrity_id,
                                                          result[0][0]))
                    print('celebrity_scene (mysql) ----------')
                    print(celebrity_id)
                    print(result[0][0])
                else:
                    # No unique match: fall back to the search page.
                    yield scrapy.Request(
                        url=config.URL_SEARCH_MOVIE + name_en,
                        meta={'id': celebrity_id},
                        cookies=config.get_cookie_douban(),
                        callback=self.parse)
        finally:
            lookup_cursor.close()
        self.conn.commit()