def _get_concreate_extractors(self):
    """Return the extractors specific to this concrete page type.

    Three entries are produced, in this order:

    * ``region`` -- XPath lookup against ``self.html``.
    * ``dbid``   -- the value returned by ``self._get_urlid()``.
    * ``kind``   -- the value of ``self.kind``.

    ``XpathExtractor`` and ``ValueExtract`` are defined outside this block.
    """
    # First text node following the "production country/region" label span.
    region_xpath = (
        u"//div[@id='info']/span[text()='制片国家/地区:']/following-sibling::text()[1]"
    )
    region = XpathExtractor('region', self.html, region_xpath)
    dbid = ValueExtract('dbid', self._get_urlid())
    kind = ValueExtract('kind', self.kind)
    return [region, dbid, kind]
def _get_extractors(self):
    """Return the full extractor list: shared fields plus concrete ones.

    The shared extractors are built first; whatever
    ``self._get_concreate_extractors()`` returns is then appended to the
    same list, which is returned.

    ``XpathExtractor``, ``ValueExtract`` and ``split_item`` are defined
    outside this block.
    """
    html = self.html

    extractors = [
        # --- fields looked up in the page via XPath ---
        XpathExtractor(
            'title', html,
            "//title/text()",
            postback=self.get_title_name,
        ),
        XpathExtractor(
            'subtitle', html,
            "//div[@id='content']/h1/span[@property='v:itemreviewed']/text()",
            postback=self.get_subtitle,
        ),
        XpathExtractor(
            'akas', html,
            u"//div[@id='info']/span[text()='又名:']/following-sibling::text()",
            postback=self.get_akas,
        ),
        XpathExtractor(
            'posterurl', html,
            "//div[@id='mainpic']//img[@rel='v:image']/@src",
            postback=self.get_posterurl,
        ),
        XpathExtractor(
            'genres', html,
            "//div[@id='info']/span[@property='v:genre']/text()",
            islist=True,
        ),
        XpathExtractor(
            'year', html,
            "//div[@id='content']/h1/span[@class='year']/text()",
            postback=self.get_year_number,
        ),
        XpathExtractor(
            'stars', html,
            "//a[@rel='v:starring']/text()",
            islist=True,
            postback=split_item,
        ),
        XpathExtractor(
            'imdbid', html,
            u"//div[@id='info']/span[text()='IMDb链接:']/following-sibling::a/text()",
        ),
        XpathExtractor(
            'description', html,
            "normalize-space(string(//span[@property='v:summary']))",
        ),
        # --- fields supplied directly as values ---
        ValueExtract('source', 'douban'),
        ValueExtract('id', self.get_dbid()),
        ValueExtract('url', self.url),
    ]

    # Append the page-type-specific extractors after the shared ones.
    extractors.extend(self._get_concreate_extractors())
    return extractors
def _get_concreate_extractors(self):
    """Return the extractors specific to this concrete page type.

    Covers ``thumbnailurl``, ``directors``, ``writers``, ``countries``,
    ``languages``, ``releaseDate`` and ``runtimes`` (all XPath lookups
    against ``self.html``), followed by ``kind`` and ``rating``, which are
    seeded with ``None``.

    ``XpathExtractor``, ``ValueExtract`` and ``split_item`` are defined
    outside this block.
    """
    html = self.html

    xpath_fields = [
        XpathExtractor(
            'thumbnailurl', html,
            "//div[@id='mainpic']//img[@rel='v:image']/@src",
            postback=self.get_thumbnailurl,
        ),
        XpathExtractor(
            'directors', html,
            "//a[@rel='v:directedBy']/text()",
            islist=True,
            postback=split_item,
        ),
        XpathExtractor(
            'writers', html,
            u"//div[@id='info']/span/span[text()='编剧']/following-sibling::span/a/text()",
            islist=True,
            postback=split_item,
        ),
        XpathExtractor(
            'countries', html,
            u"//div[@id='info']/span[text()='制片国家/地区:']/following-sibling::text()[1]",
            postback=split_item,
        ),
        XpathExtractor(
            'languages', html,
            u"//div[@id='info']/span[text()='语言:']/following-sibling::text()[1]",
            postback=split_item,
        ),
        XpathExtractor(
            'releaseDate', html,
            "//div[@id='info']/span[@property='v:initialReleaseDate']/@content",
            islist=True,
            postback=self.normalize_releasedate,
        ),
        XpathExtractor(
            'runtimes', html,
            "//div[@id='info']/span[@property='v:runtime']/@content",
        ),
    ]

    value_fields = [
        # 'kind' starts as None and is handed to the get_kind postback.
        ValueExtract('kind', None, postback=self.get_kind),
        ValueExtract('rating', None),
    ]

    return xpath_fields + value_fields
def _get_extractors(self):
    """Return the extractor list for this page.

    ``title`` and ``imdbid`` are XPath lookups against ``self.html``; the
    remaining fields are supplied as values (``id``, ``rdate``,
    ``info_url``) or seeded with ``None`` (``content_urls``, ``md5sum``,
    ``udate``), the last two with a postback attached.

    ``XpathExtractor`` and ``ValueExtract`` are defined outside this block.
    """
    html = self.html

    title = XpathExtractor(
        'title', html,
        "//div[@class='moviedteail_tt']/h1/text()",
    )
    imdbid = XpathExtractor(
        'imdbid', html,
        "//ul[@class='moviedteail_list']/li[text()='IMDB:']/a/text()",
    )

    return [
        title,
        imdbid,
        ValueExtract('id', self._get_urlid()),
        ValueExtract('rdate', self.release_date),
        ValueExtract('info_url', self.url),
        ValueExtract('content_urls', None),
        ValueExtract('md5sum', None, postback=self.calc_md5),
        ValueExtract('udate', None, postback=self.get_udate),
    ]
def _get_extractors(self):
    """Return the extractor list for a 'kubo' page.

    Six fields (``kind``, ``region``, ``year``, ``completed``,
    ``update_eps``, ``total_eps``) share one XPath expression and differ
    only in the postback attached to each.

    ``XpathExtractor`` and ``ValueExtract`` are defined outside this block.
    """
    html = self.html

    # String value of the paragraph containing the "category" label; the
    # same expression feeds six different postbacks below.
    category_xpath = u"string(//div[@class='vshow']/p[contains(text(), '分類:')])"

    return [
        XpathExtractor(
            'title', html,
            "//div[@class='vshow']/h2/text()",
        ),
        XpathExtractor(
            'akas', html,
            u"string(//div[@class='vshow']/p[contains(text(), '別名:')])",
            postback=self.split_akas,
        ),
        XpathExtractor(
            'posterurl', html,
            "//div[@class='vpic']/img/@src",
        ),
        XpathExtractor(
            'stars', html,
            u"//div[@class='vshow']/p[contains(text(), '演出:')]/a/text()",
            islist=True,
        ),
        XpathExtractor(
            'genres', html,
            u"//div[@class='vshow']/p[contains(text(), '類型:')]/a/text()",
            islist=True,
        ),
        # --- fields derived from the shared category paragraph ---
        XpathExtractor('kind', html, category_xpath,
                       postback=self.get_kind),
        XpathExtractor('region', html, category_xpath,
                       postback=self.get_region),
        XpathExtractor('year', html, category_xpath,
                       postback=self.get_year),
        XpathExtractor('completed', html, category_xpath,
                       postback=self.get_completed_flag),
        XpathExtractor('update_eps', html, category_xpath,
                       postback=self.get_update_ep),
        XpathExtractor('total_eps', html, category_xpath,
                       postback=self.get_total_eps),
        XpathExtractor(
            'description', html,
            "string(//div[@class='vcs']/ul)",
        ),
        XpathExtractor(
            'rdate', html,
            u"string(//text()[contains(., '更新時間:')])",
            postback=self.get_rdate,
        ),
        # --- fields supplied directly as values ---
        ValueExtract('source', 'kubo'),
        ValueExtract('id', self.get_id()),
        ValueExtract('dbid', None),
        ValueExtract('url', self.url),
        ValueExtract('play_urls', {}, postback=self.get_play_urls),
    ]
def _get_extractors(self):
    """Return the extractor list for an 'atmovies' page.

    ``runtimes`` and ``releaseDate`` share one XPath expression and differ
    only in their postback.  ``directors``, ``writers`` and ``stars`` are
    computed eagerly by calling the corresponding ``self.get_*()`` methods
    while the list is being built.

    ``XpathExtractor`` and ``ValueExtract`` are defined outside this block.
    """
    # String value of the 'runtime' list; parsed twice by different postbacks.
    runtime_xpath = "string(//ul[@class='runtime'])"

    return [
        XpathExtractor(
            'title',
            self.html,
            "//span[@class='ratingbutton']/a/@href",
            postback=self.get_title,
        ),
        XpathExtractor(
            'akas',
            self.html,
            "//div[@class='filmTitle']/text()",
            postback=self.get_akas,
        ),
        XpathExtractor(
            'runtimes',
            self.html,
            runtime_xpath,
            postback=self.get_runtimes,
        ),
        XpathExtractor(
            'releaseDate',
            self.html,
            runtime_xpath,
            postback=self.get_releaseDate,
        ),
        # Credits are obtained through dedicated methods, not XPath here.
        ValueExtract('directors', self.get_directors()),
        ValueExtract('writers', self.get_writers()),
        ValueExtract('stars', self.get_stars()),
        XpathExtractor(
            'year',
            self.html,
            u"//div[@id='filmCastDataBlock']/ul/li/b[text()='影片年份:']/following-sibling::text()[1]",
            postback=self.get_year,
        ),
        XpathExtractor(
            'thumbnailurl',
            self.html,
            "//div[@id='filmTagBlock']/span[1]/a[@class='image Poster']/img/@src",
            postback=self.remove_default_image,
        ),
        ValueExtract('posterurl', None, postback=self.get_posterurl),
        XpathExtractor(
            'countries',
            self.html,
            u"//div[@id='filmCastDataBlock']/ul/li/b[text()='出 品 國:']/following-sibling::text()[1]",
            postback=self.split_countries,
        ),
        XpathExtractor(
            'languages',
            self.html,
            u"//div[@id='filmCastDataBlock']/ul/li/b[text()='語 言:']/following-sibling::text()[1]",
            postback=self.split_languages,
        ),
        XpathExtractor(
            'imdbid',
            self.html,
            "//div[@id='filmCastDataBlock']/ul/li/a[text()='IMDb']/@href",
            postback=self.get_imdbid,
        ),
        XpathExtractor(
            'description',
            self.html,
            "normalize-space(string(//comment()[contains(.,'Story info start')]/..))",
            postback=self.get_description,
        ),
        # --- fixed or directly supplied values ---
        ValueExtract('source', 'atmovies'),
        ValueExtract('id', self.get_id()),
        ValueExtract('kind', 'movie'),
        ValueExtract('genres', []),
        ValueExtract('rating', None),
        ValueExtract('url', self.url),
    ]
def _get_extractors(self):
    """Return the extractor list for this product page.

    Most fields are XPath lookups against ``self.html``; several read the
    value cell of a labelled row in the ``mg-b20`` table.  ``id``, ``url``
    and ``udate`` are supplied as values, while ``code`` and ``md5sum`` are
    seeded with ``None`` and given a postback.

    ``XpathExtractor`` and ``ValueExtract`` are defined outside this block.
    """
    html = self.html

    return [
        XpathExtractor(
            'title', html,
            "//h1[@id='title']/text()",
        ),
        XpathExtractor(
            'posterurl', html,
            "//div[@id='sample-video']//img/@src",
            postback=self.get_posterurl,
        ),
        XpathExtractor(
            'duration', html,
            u"//table[@class='mg-b20']/tr/td[text()='収録時間:']/../td[2]/text()",
            postback=self.get_duration,
        ),
        XpathExtractor(
            'performer', html,
            "//span[@id='performer']/a[contains(@href, '/digital/')]/text()",
            islist=True,
            postback=self.get_performers,
        ),
        XpathExtractor(
            'category', html,
            u"//table[@class='mg-b20']/tr/td[text()='ジャンル:']/../td[2]/a/text()",
            islist=True,
        ),
        XpathExtractor(
            'rating', html,
            "//p[@class='d-review__average']/strong/text()",
            postback=self.get_rating,
        ),
        XpathExtractor(
            'description', html,
            "//div[@class='page-detail']/table[@class='mg-b12']/tr/td[1]/div[@class='mg-b20 lh4']/text()",
        ),
        XpathExtractor(
            'date', html,
            u"//table[@class='mg-b20']/tr/td[text()='商品発売日:']/../td[2]/text()",
            postback=self.get_date,
        ),
        # The label cell is matched with the descendant axis (`//td`).  The
        # previous direct-child step (`table/td`) cannot match cells nested
        # in a <tr>, which is how every other lookup on this table addresses
        # them.  `//td` still matches anything the old expression matched.
        # NOTE(review): confirm against a live page that this row exists.
        XpathExtractor(
            'date2', html,
            u"//table[@class='mg-b20']//td[normalize-space(text())='配信開始日:']/following-sibling::td/text()",
            postback=self.get_delivery_date,
        ),
        XpathExtractor(
            'samples', html,
            "//div[@id='sample-image-block']/a[@name='sample-image']/img[@class='mg-b6']/@src",
            islist=True,
            postback=self.get_samples,
        ),
        XpathExtractor(
            'maker', html,
            u"//table[@class='mg-b20']/tr/td[text()='メーカー:']/../td[2]/a/text()",
        ),
        XpathExtractor(
            'series', html,
            u"//table[@class='mg-b20']/tr/td[text()='シリーズ:']/../td[2]/a/text()",
        ),
        # --- fields supplied directly as values ---
        ValueExtract('id', self._get_urlid()),
        ValueExtract('code', None, postback=self.get_code),
        ValueExtract('url', self.url),
        ValueExtract('md5sum', None, postback=self.get_md5),
        # Timestamp is taken once, when this list is built (naive UTC).
        ValueExtract('udate', datetime.datetime.utcnow()),
    ]