def search(meta_info, user_setting):
    """Return a one-element list holding the metadata for ``meta_info``.

    The cache is consulted first; on a hit the cached entry is returned
    wrapped in a list, otherwise a fresh, unpopulated ``MetaData`` is
    returned in its place.
    """
    plugin_name = config.get_info('en').get('name')
    video_title = meta_info.get('video_title')
    # Read from the request but not used by the code below.
    part_file = meta_info.get('part_file')
    log('info', 'title:%s' % video_title, plugin_name)

    # ``code`` is the formatted media name used to check the cache.
    code = 'formatted file name'
    cached = check_cache(code, get_info('en').get('name'))
    if cached:
        return [cached]

    # Cache miss: this is where metadata would be searched for online.
    return [MetaData()]
def analysis_media_html_byxpath(self, html, q):
    """Build a ``MetaData`` object from a parsed detail page using XPath.

    Args:
        html: parsed page object exposing ``xpath()``.
        q: query string; its cleaned, upper-cased form becomes
            ``media.number``.

    Returns:
        A ``MetaData`` instance carrying every field that could be located
        on the page. Fields whose nodes are missing are left untouched.
    """
    media = MetaData()
    media.number = self.tools.cleanstr(q.upper())

    title = html.xpath("//div[@class='detail_title_new2']/table/tr/td[2]/h1")
    if title:
        media.title = self.tools.cleanstr(title[0].text)

    poster = html.xpath(
        "//table[@class='item_detail']//tr[1]//td[1]//a//img[@class='item_img']/@src")
    if poster:
        # The page supplies scheme-relative URLs, so the scheme is prefixed.
        poster_url = 'https:%s' % self.tools.cleanstr(poster[0])
        media.poster = poster_url
        media.thumbnail = poster_url

    summary = html.xpath(
        "//table[@class='item_detail']//tr[2]//td[@class='text']"
        "//div[@class='item_text']/text()")
    # The second text node is the one used, so require at least two nodes
    # instead of raising IndexError when only one is present.
    if len(summary) > 1:
        media.summary = self.tools.cleanstr(summary[1])

    studio = html.xpath(
        "//div[@class='item_register']/table[@class='item']//tr[2]/td[2]/a")
    if studio:
        media.studio = self.tools.cleanstr(studio[0].text)

    directors = html.xpath("//table[@class='item']//tr[5]//td[2]/a")
    if directors:
        media.directors = self.tools.cleanstr(directors[0].text)

    collections = html.xpath("//table[@class='item']//tr[4]//td[2]//a")
    # Guard against an empty result before looking at the first node.
    if collections and collections[0].text is not None:
        media.collections = self.tools.cleanstr(collections[0].text)

    # Year and release date are read from the same table cell.
    release = html.xpath("//table[@class='item']//tr[6]/td[2]/text()")
    if release:
        release_date = self.tools.formatdatetime(
            self.tools.cleanstr(release[0]))
        media.year = release_date
        media.originally_available_at = release_date

    category_nodes = html.xpath(
        "//div[@id='adultgenre2']//table//tr/td[2]//ul//li/a")
    categorys = ','.join(
        [self.tools.cleanstr(node.text) for node in category_nodes])
    if categorys:
        media.category = categorys

    actor_links = html.xpath(
        "//div[@class='item_register']//table[@class='item']//tr[1]/td[2]//a")
    actor_hrefs = html.xpath(
        "//div[@class='item_register']//table[@class='item']//tr[1]/td[2]/a/@href")
    if actor_links:
        actor = {}
        # zip() stops at the shorter list, so a name without a matching href
        # is skipped instead of raising IndexError.
        for link, href in zip(actor_links, actor_hrefs):
            # Use a separate name so the ``html`` argument is not rebound.
            actor_page = self.get_html_byurl('https://www.arzon.jp%s' % href)
            if not actor_page['issuccess']:
                continue
            images = actor_page['html'].xpath(
                "//table[@class='p_list1']//img/@src")
            # An actor page without a picture yields an empty image URL.
            actor[link.text] = ('https:%s' % images[0]) if images else ''
        media.actor = actor

    return media
def analysisMediaHtmlByxpath(self, html, q):
    """Fill a ``MetaData`` object from the parsed page via absolute XPaths.

    ``html`` is the parsed page (it must expose ``xpath()``) and ``q`` is the
    query whose cleaned, upper-cased form is stored as the number. The
    populated ``MetaData`` is returned. Lookups that index ``[0]`` raise
    ``IndexError`` when the node is missing.
    """
    media = MetaData()
    media.number = self.tools.cleanstr(q.upper())

    # The page heading doubles as both title and summary.
    heading = html.xpath("/html/body/div[2]/div/div[1]/h3")[0].text
    media.title = heading
    media.summary = heading

    url_prefix = 'https://' + self.basicUrl
    poster_url = url_prefix + html.xpath(
        "//*[@id='slider']/ul[1]/li[1]/img")[0].attrib['src']
    media.poster = poster_url
    media.thumbnail = poster_url

    media.studio = 'FC2'
    media.directors = ''

    media.collections = html.xpath(
        "/html/body/div[2]/div/div[1]/h5[3]/a[1]")[0].text

    media.year = ''
    media.originally_available_at = ''

    tag_nodes = html.xpath("/html/body/div[2]/div/div[1]/h5[6]/a")
    joined_tags = ','.join(
        [self.tools.cleanstr(node.text) for node in tag_nodes])
    if joined_tags:
        media.category = joined_tags

    actor_text = html.xpath("/html/body/div[2]/div/div[1]/h5[5]/a")[0].text
    if actor_text != '':
        media.actor = actor_text

    return media
def analysis_media_html_byxpath(self, html, q):
    """Extract metadata from a parsed detail page using XPath.

    ``html`` is the parsed page (it must expose ``xpath()``); ``q`` is the
    query whose cleaned, upper-cased form is stored as the number. Returns
    the populated ``MetaData``; fields without a matching node are skipped.
    """
    media = MetaData()
    number = self.tools.cleanstr(q.upper())
    media.number = number

    title_nodes = html.xpath("//div[@class='container']/h3/text()")
    if title_nodes:
        # cleantitlenumber receives the cleaned title together with the number.
        media.title = self.tools.cleantitlenumber(
            self.tools.cleanstr(title_nodes[0]), number)

    poster_nodes = html.xpath(
        "//div[@class='col-md-9 screencap']/a[@class='bigImage']/img/@src")
    if poster_nodes:
        poster_url = self.checkUrl + self.tools.cleanstr(poster_nodes[0])
        media.poster = poster_url
        media.thumbnail = poster_url

    studio_nodes = html.xpath("//div[@class='col-md-3 info']/p[5]/a/text()")
    if studio_nodes:
        media.studio = self.tools.cleanstr(studio_nodes[0])

    director_nodes = html.xpath("//div[@class='col-md-3 info']/p[4]/a/text()")
    if director_nodes:
        media.directors = self.tools.cleanstr(director_nodes[0])

    collection_nodes = html.xpath("//div[@class='col-md-3 info']/p[6]/a/text()")
    if collection_nodes:
        media.collections = self.tools.cleanstr(collection_nodes[0])

    date_nodes = html.xpath(
        "/html/body/div[@class='container']/div[@class='row movie']"
        "/div[@class='col-md-3 info']/p[2]/text()")
    if date_nodes:
        release = self.tools.cleanstr(date_nodes[0])
        media.originally_available_at = release
        media.year = release

    genre_nodes = html.xpath(
        "/html/body/div[@class='container']/div[@class='row movie']"
        "/div[@class='col-md-3 info']/p[8]/span[@class='genre']/a")
    genres = ','.join([self.tools.cleanstr(node.text) for node in genre_nodes])
    if genres:
        media.category = genres

    name_nodes = html.xpath("/html/body/div[5]/div[1]/div[2]/p/span/a")
    image_srcs = html.xpath("//div[@id='star-div']//img/@src")
    if name_nodes:
        actor = {}
        for idx, node in enumerate(name_nodes):
            src = image_srcs[idx]
            # A 'nowprinting' image source is stored as an empty picture URL.
            if src.find('nowprinting') > 0:
                actor[node.text] = ''
            else:
                actor[node.text] = self.checkUrl + src
        media.actor = actor

    return media
def analysisMediaHtmlByxpath(self, html, q):
    """Populate a ``MetaData`` object from the parsed 'caribbean' page.

    ``html`` must expose ``xpath()``; ``q`` is the query whose cleaned,
    upper-cased form becomes the number and is embedded in the image URLs.
    Title, summary and date index ``[0]`` directly and raise ``IndexError``
    when the node is absent.
    """
    media = MetaData()
    number = self.tools.cleanstr(q.upper())
    media.number = number
    media.web = 'caribbean'

    media.title = html.xpath(
        "//*[@id='moviepages']/div/div[1]/div[1]/div[2]/h1")[0].text
    media.summary = html.xpath(
        "//*[@id='moviepages']/div/div[1]/div[1]/p")[0].text

    # Poster and thumbnail share one URL built from the host and the number.
    cover_url = 'https://%s/moviepages/%s/images/l_l.jpg' % (
        self.basicUrl, number)
    media.poster = cover_url
    media.thumbnail = cover_url

    media.studio = 'Caribbeancom'
    media.directors = ''
    media.collections = 'Caribbeancom'

    release = html.xpath(
        "//*[@id='moviepages']/div/div[1]/div[1]/ul/li[2]/span[2]")[0].text
    media.year = release
    media.originally_available_at = release

    tag_nodes = html.xpath(
        "//*[@id='moviepages']/div/div[1]/div[1]/ul/li[4]/span[2]/a")
    tags = ','.join([self.tools.cleanstr(node.text) for node in tag_nodes])
    if tags:
        media.category = tags

    # Every actor is mapped to the same fixed logo image.
    actor_nodes = html.xpath(
        "//*[@id='moviepages']/div/div[1]/div[1]/ul/li[1]/span[2]/a/span")
    media.actor = {
        node.text: 'https://images.d2pass.com/images/d2p_toolbar/images/d2p_logo.png'
        for node in actor_nodes
    }

    return media
def analysisMediaHtmlByxpath(self, html, q):
    """Populate a ``MetaData`` object from the parsed 'caribbeancompr' page.

    ``html`` must expose ``xpath()``; ``q`` is the query whose cleaned,
    upper-cased form becomes the number and is embedded in the image URLs.
    Title and summary index ``[0]`` directly and raise ``IndexError`` when
    the node is absent; the remaining fields are skipped when missing.
    """
    media = MetaData()
    number = self.tools.cleanstr(q.upper())
    media.number = number
    media.web = 'caribbeancompr'

    media.title = html.xpath(
        "//*[@id='moviepages']/div/div[2]/div[1]/div/div[2]/h1")[0].text
    media.summary = html.xpath(
        "//*[@id='moviepages']/div/div[2]/div[1]/div/p")[0].text

    # Poster and thumbnail share one URL built from the host and the number.
    cover_url = 'https://%s/moviepages/%s/images/l_l.jpg' % (
        self.basicUrl, number)
    media.poster = cover_url
    media.thumbnail = cover_url

    media.studio = 'Caribbeancompr'
    media.directors = ''
    media.collections = 'Caribbeancompr'

    date_nodes = html.xpath(
        "//li[@class='movie-spec'][2]/span[@class='spec-content']/text()")
    if date_nodes:
        release = self.tools.cleanstr(date_nodes[0])
        media.year = release
        media.originally_available_at = release

    tag_texts = html.xpath(
        "//li[@class='movie-spec'][5]/span[@class='spec-content']/a/text()")
    tags = ','.join([self.tools.cleanstr(text) for text in tag_texts])
    if tags:
        media.category = tags

    actor_nodes = html.xpath(
        "//li[@class='movie-spec'][1]/span[@class='spec-content']/a")
    if actor_nodes:
        # No portrait is looked up here; each name maps to an empty URL.
        media.actor = {node.text: '' for node in actor_nodes}

    return media
def analysis_media_html_byxpath(self, html, q):
    """Build a ``MetaData`` object from a parsed pacopacomama detail page.

    Args:
        html: parsed page object exposing ``xpath()``.
        q: query string; its cleaned, upper-cased form becomes
            ``media.number`` and is embedded in the image URLs.

    Returns:
        A ``MetaData`` instance with every field that could be located.
        Fields whose nodes are missing are left untouched.
    """
    media = MetaData()
    number = self.tools.cleanstr(q.upper())
    media.number = number

    title = html.xpath("//div[@id='main']/h1/text()")
    if title:
        media.title = self.tools.cleantitlenumber(
            self.tools.cleanstr(title[0]), number)

    summary = html.xpath("//dd[@class='comment']/div/text()")
    if summary:
        media.summary = summary[0]

    # Image URLs are derived from the number rather than scraped.
    media.poster = (
        'https://www.pacopacomama.com/moviepages/%s/images/poster_en.jpg'
        % number)
    media.thumbnail = (
        'https://www.pacopacomama.com/moviepages/%s/images/l/1.jpg' % number)

    media.studio = 'PacoPacoMama'
    media.directors = ''
    media.collections = 'PacoPacoMama'

    year = html.xpath("//div[@class='movie-info']/dl[3]/dd")
    if year:
        release = self.tools.cleanstr(year[0].text)
        media.year = release
        media.originally_available_at = release

    category_texts = html.xpath(
        "//div[@class='clearfix']/table/tr[4]/td[2]/a/text()")
    categorys = ','.join(
        [self.tools.cleanstr(text) for text in category_texts])
    if categorys:
        media.category = categorys

    actor_names = html.xpath(
        "//div[@class='clearfix']/table/tr[1]/td[2]/a/text()")
    if actor_names:
        portraits = html.xpath(
            "//div[@class='clearfix']/img[@class='lefty']/@src")
        # The portrait used to be indexed unconditionally, so a page without
        # it raised IndexError and lost all metadata. Fall back to ''.
        if portraits:
            portrait_url = 'https://www.pacopacomama.com%s' % portraits[0]
        else:
            portrait_url = ''
        # Every listed actor shares the single portrait found on the page.
        media.actor = {
            self.tools.cleanstr2(name): portrait_url for name in actor_names
        }

    return media
def analysis_media_html_byxpath(self, html, q):
    """Extract metadata from a parsed heyzo detail page using XPath.

    ``html`` must expose ``xpath()``. ``q`` is the query: its cleaned,
    upper-cased form prefixed with ``heyzo-`` becomes the number, while the
    raw ``q`` is embedded in the image URLs. Returns the populated
    ``MetaData``; fields without a matching node are skipped.
    """
    media = MetaData()
    number = 'heyzo-%s' % self.tools.cleanstr(q.upper())
    media.number = number

    title_texts = html.xpath(
        "//div[@id='wrapper']/article/section[1]/div[@id='movie']/h1/text()")
    if title_texts:
        media.title = self.tools.cleantitlenumber(
            self.tools.cleanstr(title_texts[0]), number)

    memo_texts = html.xpath("//p[@class='memo']/text()")
    if memo_texts:
        media.summary = memo_texts[0]

    media.poster = (
        'https://www.heyzo.com/contents/3000/%s/images/player_thumbnail.jpg'
        % q)
    media.thumbnail = (
        'https://www.heyzo.com/contents/3000/%s/gallery/001.jpg' % q)
    media.studio = 'Heyzo'

    series_texts = html.xpath("//tr[@class='table-series']/td[2]/text()")
    if series_texts:
        series = self.tools.cleanstr(series_texts[0])
        # '-----' is not stored as a collection.
        if series != '-----':
            media.collections = series

    date_texts = html.xpath("//tr[@class='table-release-day']/td[2]/text()")
    if date_texts:
        release = self.tools.cleanstr(date_texts[0])
        media.year = release
        media.originally_available_at = release

    tag_texts = html.xpath("//ul[@class='tag-keyword-list']/li/a/text()")
    tags = ','.join([self.tools.cleanstr(text) for text in tag_texts])
    if tags:
        media.category = tags

    actor_texts = html.xpath("//tr[@class='table-actor']/td//a/span/text()")
    if actor_texts:
        # No portrait is looked up; each name maps to an empty URL.
        media.actor = dict.fromkeys(actor_texts, '')

    return media
def analysis_media_html_byxpath(self, html, q):
    """Extract metadata from a parsed page using XPath.

    ``html`` must expose ``xpath()``. ``q`` is the query: its upper-cased
    form is used as the title and its cleaned upper-cased form as the
    number. Returns the populated ``MetaData``; fields without a matching
    node are skipped.
    """
    media = MetaData()
    media.title = q.upper()
    media.number = self.tools.cleanstr(q.upper())

    image_srcs = html.xpath("//div[@class='column']/img[@class='image']/@src")
    if image_srcs:
        image_url = self.tools.cleanstr(image_srcs[0])
        media.poster = image_url
        media.thumbnail = image_url

    summary_texts = html.xpath("//p[@class='level has-text-grey-dark']/text()")
    if summary_texts:
        # The ``+ ''`` is kept from the original; presumably it coerces the
        # xpath result to a plain string -- TODO confirm.
        media.summary = summary_texts[0] + ''

    date_texts = html.xpath("//p[@class='subtitle is-6']/a/text()")
    if date_texts:
        release = self.tools.dateconvert(date_texts[0])
        media.year = release
        media.originally_available_at = release

    tag_texts = html.xpath("//div[@class='tags']//a/text()")
    tags = ','.join([self.tools.cleanstr(text) for text in tag_texts])
    if tags:
        media.category = tags

    actor_nodes = html.xpath("//a[@class='panel-block']")
    if actor_nodes:
        # No portrait is looked up; each name maps to an empty URL.
        media.actor = {node.text: '' for node in actor_nodes}

    return media
def analysis_media_html_byxpath(self, browser, q):
    """Load the 1pondo movie page for ``q`` in ``browser`` and scrape it.

    The "see more" button is clicked before reading the page. When that
    button is not found an empty list is returned; otherwise the populated
    ``MetaData`` is returned. Title, summary, collection and date index
    ``[0]`` directly and raise ``IndexError`` when the element is absent.
    """
    media = MetaData()
    browser.get("https://www.1pondo.tv/movies/%s/" % q)

    see_more = browser.find_elements_by_xpath(
        "//button[@class='button-flat button-medium button-icon--right see-more']")
    if not see_more:
        return []
    see_more[0].click()
    # Pause for one second after the click before reading the page.
    time.sleep(1)

    number = self.tools.cleanstr(q.upper())
    media.number = number
    media.web = 'onePondo'

    # Title
    headings = browser.find_elements_by_xpath("//h1[@class='h1--dense']")
    media.title = headings[0].text

    paragraphs = browser.find_elements_by_xpath(
        "//div[@class='movie-info section divider']/div[@class='movie-detail']/p")
    media.summary = paragraphs[0].text

    # Poster and thumbnail use the same sample image URL.
    sample_url = 'https://www.1pondo.tv/assets/sample/%s/str.jpg' % number
    media.poster = sample_url
    media.thumbnail = sample_url
    media.studio = '一本道'

    # Collection
    series = browser.find_elements_by_xpath(
        "//li[@class='movie-detail__spec'][3]/span[@class='spec-content']")
    media.collections = series[0].text

    # Release date
    dates = browser.find_elements_by_xpath(
        "//li[@class='movie-detail__spec'][1]/span[@class='spec-content']")
    media.year = dates[0].text
    media.originally_available_at = dates[0].text

    # Categories
    tag_elements = browser.find_elements_by_xpath(
        "//span[@class='spec-content']/a[@class='spec__tag']")
    tags = ','.join([self.tools.cleanstr(el.text) for el in tag_elements])
    if tags:
        media.category = tags

    # Actors
    actor_elements = browser.find_elements_by_xpath(
        "//li[@class='movie-detail__spec'][2]/span[@class='spec-content']")
    if actor_elements:
        media.actor = {
            self.tools.cleanstr2(el.text): '' for el in actor_elements
        }

    return media
def analysis_media_html_byxpath(self, browser, q):
    """Scrape the page currently loaded in ``browser`` into a ``MetaData``.

    The ``detail_data`` block is read line by line as ``label:value`` pairs
    and dispatched on the label; title, summary, poster and thumbnail are
    then read from dedicated elements. ``q`` is not used by this function.
    Element lookups index ``[0]`` directly and raise ``IndexError`` when the
    element is absent.
    """
    media = MetaData()

    detail_blocks = browser.find_elements_by_xpath("//div[@class='detail_data']")
    for line in detail_blocks[0].text.split('\n'):
        parts = line.split(':')
        if len(parts) <= 1:
            # Not a "label:value" line.
            continue

        keyword = self.tools.cleanstr(parts[0])
        # Only the text between the first and second ':' is used as the value.
        value = parts[1]

        if keyword == '出演':
            # Actor: the whole value is stored as one name with no portrait.
            media.actor = {
                self.tools.cleanstr2(self.tools.cleanstr(value)): ''
            }
        elif keyword == 'メーカー':
            # Studio
            media.studio = self.tools.cleanstr(value)
        elif keyword == '品番':
            # Number
            media.number = self.tools.cleanstr(value)
        elif keyword == '配信開始日':
            # Date
            media.year = self.tools.formatdatetime(self.tools.cleanstr(value))
            media.originally_available_at = self.tools.formatdatetime(
                self.tools.cleanstr(value))
        elif keyword == 'シリーズ':
            # Series
            media.collections = self.tools.cleanstr2(self.tools.cleanstr(value))
        elif keyword == 'ジャンル':
            # Categories: space-separated, empty fragments dropped.
            tags = ','.join([
                self.tools.cleanstr(item)
                for item in value.split(' ')
                if item != ''
            ])
            if tags:
                media.category = tags

    # Title
    headings = browser.find_elements_by_xpath("//h1[@class='tag']")
    media.title = self.tools.cleanstr(headings[0].text)

    # Click the "introduction_all" element unless it is hidden.
    expanders = browser.find_elements_by_xpath("//p[@id='introduction_all']")
    if expanders[0].get_attribute("style") != 'display: none;':
        expanders[0].click()

    intro = browser.find_elements_by_xpath("//p[@class='txt introduction']")
    media.summary = intro[0].text

    enlarge_links = browser.find_elements_by_xpath("//a[@id='EnlargeImage']")
    media.poster = enlarge_links[0].get_attribute('href')

    sample_links = browser.find_elements_by_xpath(
        "//div[@class='detail_left']/dl[@id='sample-photo']/dd/ul/li[1]"
        "/a[@class='sample_image']")
    media.thumbnail = sample_links[0].get_attribute('href')

    return media
def search(meta_info, user_setting):
    """Look up album and artist metadata for the media described by ``meta_info``.

    Only ``media_type`` values ``'album'`` and ``'artist'`` are handled; any
    other value returns an empty list. A cache hit is returned exactly as
    ``check_cache`` produced it. Otherwise the best search result is used
    only when its ``score`` reaches the threshold chosen for the media type.

    Args:
        meta_info: mapping read with ``.get()``; the keys used here are
            ``media_type``, ``name`` and ``file_name`` (it is also passed on
            to ``get_code``).
        user_setting: mapping read with ``.get()``; ``hotComment`` toggles
            appending the collected reviews to the album summary.

    Returns:
        The cached value on a cache hit; otherwise a list that is either
        empty (no acceptable result) or holds one populated ``MetaData``.
    """
    score = MATCH_SCORE
    plugin_name = config.get_info('en').get('name')
    meta_data_list = []
    if meta_info.get('media_type') == 'album':
        if meta_info.get('name'):
            # An explicit name is searched with the album threshold.
            file_name = meta_info.get('name')
            score = MATCH_ALBUM
        else:
            file_name = clear_file_name(meta_info.get('file_name'))
    elif meta_info.get('media_type') == 'artist':
        if meta_info.get('file_name'):
            file_name = clear_file_name(meta_info.get('file_name'))
        else:
            file_name = ''
        # NOTE(review): indentation was lost in this chunk; this assignment is
        # assumed to apply to both branches above -- confirm against the
        # original file.
        score = MATCH_ARTIST
    else:
        # Unsupported media type: nothing to search for.
        return meta_data_list
    log('info', 'title:%s' % file_name, plugin_name)
    code = get_code(file_name, meta_info)
    result = None
    cache_data = check_cache(code, get_info('en').get('name'))
    if cache_data:
        # NOTE(review): returned as-is rather than appended to
        # meta_data_list -- presumably the cache already stores a list; verify.
        return cache_data
    else:
        # search for meta data from internet
        results = search_song_by_code(code)
        sort_result(results, code)
        highest_result = get_highest_result(results)
        if not highest_result:
            return meta_data_list
        # Accept the top result only when it reaches the chosen threshold.
        if highest_result.get('score') >= score:
            result = highest_result
    if not result:
        return meta_data_list
    # These helpers are called for their effect on ``result``: the keys
    # 'album', 'artists' and 'reviews' are read from it below.
    get_artist_info(result)
    get_album_info(result)
    get_hot_comment(result)
    meta_data = MetaData()
    # Album
    album = Album()
    album.title = result.get('album').get('name')
    if result.get('album').get('description'):
        album.summary = result.get('album').get('description')
    album.studio = result.get('album').get('company')
    album.tags = result.get('album').get('tags')
    album.collections = result.get('album').get('subType')
    album.poster = get_picture_base64(result.get('album').get('picUrl'))
    # publishTime is divided by 1000 before fromtimestamp -- presumably it is
    # in milliseconds; confirm against the API response.
    album.originally_available_at = datetime.datetime.fromtimestamp(
        result.get('album').get('publishTime') / 1000).strftime('%Y-%m-%d')
    # Each review is stored as "<nickname>: <content>".
    for review in result['reviews'].get('topComments'):
        album.reviews.append(
            review.get('user').get('nickname') + ': ' + review.get('content'))
    for review in result['reviews'].get('hotComments'):
        album.reviews.append(
            review.get('user').get('nickname') + ': ' + review.get('content'))
    if user_setting.get('hotComment'):
        # Append the reviews to the summary, one per line with embedded
        # newlines removed.
        album.summary += '\n 网易云热门评论: '
        for review in album.reviews:
            album.summary += '\n ' + review.replace('\n', '')
    meta_data.album = album
    # Artist
    for result_artist in result.get('artists'):
        artist = Artist()
        # The same cover image is used for both poster and art.
        artist.poster = get_picture_base64(result_artist.get('cover'))
        artist.art = get_picture_base64(result_artist.get('cover'))
        title = result_artist.get('name')
        # Placeholder artist names are replaced with a fixed label.
        if title == 'Various Artists' or title == '[Unknown Artist]':
            title = '未知艺术家'
        artist.title = title
        if result_artist.get('identifyTag'):
            artist.tags = ','.join(result_artist.get('identifyTag'))
        # The summary starts empty and is extended with whichever of the
        # optional fields below are present.
        artist.summary = ''
        if result_artist.get('briefDesc'):
            artist.summary = result_artist.get('briefDesc')
        if result_artist.get('rank'):
            rank = result_artist.get('rank')
            # rank['type'] is used as an index into this list of region labels.
            rank_type = ['', '华语', '欧美', '韩国', '日本']
            rank_string = '\n歌手排行:' + str(
                rank_type[rank.get('type')]) + '地区:' + str(rank.get('rank'))
            artist.summary += rank_string
        if result_artist.get('albumSize'):
            artist.summary += '\n歌手专辑数:' + str(result_artist.get('albumSize'))
        if result_artist.get('musicSize'):
            artist.summary += '\n歌手音乐数:' + str(result_artist.get('musicSize'))
        if result_artist.get('mvSize'):
            artist.summary += '\n歌手MV数:' + str(result_artist.get('mvSize'))
        meta_data.artist.append(artist)
    meta_data.code = code
    meta_data_list.append(meta_data)
    return meta_data_list
def analysis_media_html_byxpath(self, browser, q):
    """Load the heydouga movie page for ``q`` in ``browser`` and scrape it.

    ``q`` is embedded in the page URL, and with ``/`` replaced by ``-`` it
    becomes the number. The last regex match found in ``q`` (or ``''`` when
    there is none) is passed to ``self.format`` to build the image URLs.
    Returns the populated ``MetaData``. Title, summary and date index
    ``[0]`` directly and raise ``IndexError`` when the element is absent.
    """
    matches = [
        found.group()
        for found in re.finditer(
            r'[0-9]{4}\D[0-9]{1,5}|[0-9]{4}\D(Q|q)[0-9]{1,5}|[0-9]{4}\D(.{3})\D[0-9]{4}|[0-9]{4}\D(.{3})\D[0-9]{6}\D[0-9]{3}',
            q, re.IGNORECASE)
    ]
    # The last match wins; fall back to an empty string when nothing matched.
    imgnumber = matches[-1] if matches else ''

    browser.get('https://www.heydouga.com/moviepages/%s/index.html' % q)

    media = MetaData()
    media.number = q.replace('/', '-')

    title_text = browser.find_elements_by_xpath(
        "//div[@id='title-bg']/h1")[0].text
    if title_text:
        media.title = self.tools.cleanstr(title_text)

    summary_text = browser.find_elements_by_xpath(
        "//div[@class='movie-description']/p")[0].text
    if summary_text:
        media.summary = self.tools.cleanstr(summary_text)

    media.poster = (
        'https://www.heydouga.com/contents/%s/player_thumb.jpg'
        % self.format(imgnumber))
    media.thumbnail = (
        'https://www.heydouga.com/contents/%s/player_thumb.jpg'
        % self.format(imgnumber))

    media.studio = 'heydouga'
    media.collections = 'heydouga'

    release_text = browser.find_elements_by_xpath(
        "//div[@id='movie-info']//li[1]/span[2]")[0].text
    if release_text:
        media.year = release_text
        media.originally_available_at = release_text

    tag_elements = browser.find_elements_by_xpath(
        "//ul[@id='movie_tag_list']/li/a")
    tags = ','.join([self.tools.cleanstr(el.text) for el in tag_elements])
    if tags:
        media.category = tags

    actor_elements = browser.find_elements_by_xpath(
        "//div[@id='movie-info']/ul/li[2]/span[2]/a")
    if actor_elements:
        # Only the first element is read; its text is split on spaces into
        # individual names, each mapped to an empty picture URL.
        media.actor = dict.fromkeys(actor_elements[0].text.split(' '), '')

    return media
def analysis_media_html_byxpath(self, html, q):
    """Build a ``MetaData`` object from a parsed 'javr' post page.

    Args:
        html: parsed page object exposing ``xpath()``.
        q: query string; its cleaned, upper-cased form becomes
            ``media.number``.

    Returns:
        A ``MetaData`` instance. ``media.studio`` is ``''`` when the page has
        no "Studio:" row.

    Raises:
        IndexError: when the title heading is missing from the page.
    """
    media = MetaData()
    media.number = self.tools.cleanstr(q.upper())
    media.web = 'javr'

    # Default to '' so a page without a "Studio:" row no longer fails with an
    # unbound local when the title is cleaned below.
    studio = ''
    xpath_p = "//div[@class='post-metadata']/p"
    # XPath positions are 1-based.
    for position in range(1, len(html.xpath(xpath_p)) + 1):
        label = html.xpath('%s[%s]/b/text()' % (xpath_p, position))
        # Rows without a <b> label are skipped instead of raising IndexError.
        if not label or label[0] != 'Studio:':
            continue
        row_texts = html.xpath('%s[%s]//text()' % (xpath_p, position))
        # The studio name is the third text node of the row.
        if len(row_texts) > 2:
            studio = row_texts[2]

    title = html.xpath("//h1[@class='entry-title1']/text()")
    title = title[0].replace('Watch XXX Japanese P**n - ', '')
    if studio:
        title = title.replace(studio, '')
    media.title = title
    media.summary = title

    for post_url in html.xpath("//img[@id='myvidcover']/@src"):
        # Inline data URIs are ignored; the last remaining URL wins.
        if 'data:image' not in post_url:
            media.poster = post_url
            media.thumbnail = post_url

    media.studio = studio
    media.directors = ''

    category_texts = html.xpath(
        "//div[@class='categories tags cactus-info']/a/text()")
    categorys = ','.join(
        [self.tools.cleanstr(text) for text in category_texts])
    if categorys:
        media.category = categorys

    actor_name = html.xpath("//div[@class='channel-content']//a/h4/text()")
    actor_url = html.xpath(
        "//div[@class='post-metadata sp-style style-5']//a/img/@data-src")
    if actor_name:
        actor = {}
        for index, name in enumerate(actor_name):
            # Names without a matching image get the site logo, via an explicit
            # bounds check rather than a broad ``except Exception``.
            if index < len(actor_url):
                actor[name] = actor_url[index]
            else:
                actor[name] = (
                    'https://ravecloud.xyz/2019/02/javraveclublogo_41.png')
        media.actor = actor

    return media
def analysis_media_html_byxpath(self, html, q):
    """Build a ``MetaData`` object from a parsed 10musume detail page.

    Args:
        html: parsed page object exposing ``xpath()``.
        q: query string; its cleaned, upper-cased form is used to clean the
            title and to build the image URLs.

    Returns:
        A ``MetaData`` instance with every field that could be located.
        Fields whose nodes are missing are left untouched.
    """
    number = self.tools.cleanstr(q.upper())
    media = MetaData()

    title = html.xpath("//dl[@class='list-spec cf']/dd[1]/text()")
    if title:
        media.title = self.tools.cleantitlenumber(
            self.tools.cleanstr(title[0]), number)

    summary = html.xpath(
        "//div[@class='detail-info__item'][2]"
        "/p[@class='detail-info__comment']/text()")
    if summary:
        media.summary = summary[0]

    # Image URLs are derived from the number rather than scraped.
    media.poster = (
        'https://www.10musume.com/moviepages//%s/images/list1.jpg' % number)
    media.thumbnail = (
        'https://www.10musume.com/moviepages//%s/images/g_b001.jpg' % number)

    media.studio = '素人専門アダルト動画'
    media.directors = ''
    media.collections = '天然むすめ'

    year = html.xpath("//dl[@class='list-spec cf']/dd[2]/text()")
    if year:
        release = self.tools.cleanstr(year[0])
        # Write to the local ``media`` that is returned. These were previously
        # assigned to ``self.media``, so the result never got its date.
        media.year = release
        media.originally_available_at = release

    category_texts = html.xpath("//dl[@class='list-spec cf']/dd[7]/a/text()")
    categorys = ','.join(
        [self.tools.cleanstr(text) for text in category_texts])
    if categorys:
        media.category = categorys

    actor_names = html.xpath("//dl[@class='list-spec cf']/dd[4]/a/text()")
    if actor_names:
        # No portrait is looked up; each name maps to an empty picture URL.
        media.actor = {
            self.tools.cleanstr2(name): '' for name in actor_names
        }

    return media