def _subtitles_urls(self, re_name, re_sub, re_lang, date=None, url=None):
    # Yield subtitle page URLs matching language, subtitle title and
    # (optionally) date. When `url` is given the page is opened first;
    # the recursion below walks listing pages into individual result pages.
    if url and not self.browser.open(url):
        return
    for tr in self.browser.cssselect('table tr', []):
        links = tr.cssselect('.a1 a')
        if not links:
            continue
        spans = links[0].cssselect('span')
        if len(spans) < 2:
            continue
        # First span holds the language, second the subtitle title
        if not re_lang.search(clean(spans[0].text)):
            continue
        title = clean(spans[1].text)
        if re_sub and not re_sub.search(title):
            continue
        yield urljoin(self.url, links[0].get('href'))
    uls = self.browser.cssselect('.box ul')
    if uls:
        for li in uls[-1]:
            links = li.cssselect('.title a')
            if not links:
                continue
            for link in links:
                title = link.text
                if not re_name.search(title):
                    continue
                date_ = self._get_date(title)
                # Tolerate a one-unit difference between expected and
                # parsed date
                if date and date_ and abs(date - date_) > 1:
                    continue
                url = urljoin(self.url, link.get('href'))
                # Recurse into the linked listing page
                for res in self._subtitles_urls(re_name, re_sub, re_lang, date, url):
                    yield res
def _get_info(self, artist, pages_max):
    '''Collect artist info (name, url, genres, albums) from the artist page.
    '''
    url = self._get_artist_url(artist)
    if not url:
        return
    self.browser.open(url)
    info = {
        'name': clean(artist, 1),
        'url': url,
        'genre': [clean(a.text) for a in self.browser.cssselect('.tags li a', [])
                if not RE_MORE_TAGS.search(a.text)],
        'albums': [],
        }
    # Get albums
    album_links = self.browser.cssselect('.artist-top-albums a')
    if not album_links:
        logger.debug('failed to find albums link for "%s" at %s', artist, url)
        return info
    if not RE_ALBUMS.search(album_links[0].text):
        return
    albums_url = urljoin(self.url, album_links[0].get('href'))
    for album in self._artist_albums(albums_url, pages_max):
        album['genre'] = info['genre']
        info['albums'].append(album)
    return info
def get_searches(query):
    '''Parse a comma-separated query ("<category>, <name>[, <album>]") into
    a list of search dicts.

    Returns [] when the query cannot be parsed. Raises QueryError for music
    queries whose album info is missing or whose artist lookup fails.
    '''
    parts = [v.strip() for v in query.split(',')]
    if len(parts) < 2:
        return []
    category, is_artist = get_category_info(clean(parts.pop(0)))
    if category is None:
        return []
    name = clean(parts.pop(0), 1)
    if not name:
        return []
    artist = name if is_artist else None
    langs = Settings.get_settings('media_langs').get(category, [])
    search = {
        'name': name,
        'category': category,
        'mode': 'once',
        'langs': langs,
        }
    if category == 'music':
        if not parts:
            artist = name
        if artist:
            try:
                return get_music_searches(artist)
            # "except X as e" replaces the Python-2-only "except X, e"
            except InfoError as e:
                raise QueryError('failed to find albums from "%s": %s',
                        artist, str(e))
        search['album'] = clean(parts.pop(0), 1)
        if not search['album']:
            raise QueryError('failed to parse query "%s": album name is missing',
                    query)
    # NOTE(review): no return is visible for the remaining paths in this
    # chunk - confirm against the full source whether the function should
    # return [search] here.
def get_info(self, query, category, artist=None):
    '''Search for `query` in `category` and return the first matching result
    info dict (title, url, plus optional rating/artist/genre), or None.
    '''
    re_cat = CAT_DEF.get(category)
    if not re_cat:
        logger.error('unknown category %s', category)
        return
    if not self.browser.submit_form(self.url, fields={'search_term': query}):
        return
    re_q = Title(query).get_search_re()
    re_artist = Title(artist).get_search_re() if artist else None
    for li in self.browser.cssselect('.search_results li.result', []):
        log = html.tostring(li, pretty_print=True)[:1000]
        # Fresh dict per candidate: with a single dict hoisted above the
        # loop, values from rejected candidates (e.g. 'rating', 'artist')
        # leaked into the dict eventually returned.
        info = {}
        type_ = li.cssselect('.result_type')
        if not type_:
            logger.error('failed to get type from %s', log)
            continue
        if not re_cat.search(clean(type_[0][0].text, 1)):
            continue
        title_ = li.cssselect('.product_title a')
        if not title_:
            logger.error('failed to get title from %s', log)
            continue
        info['title'] = clean(title_[0].text, 1)
        if not re_q.search(info['title']):
            continue
        info['url'] = urljoin(self.url, title_[0].get('href'))
        scores = []
        rating_ = li.cssselect('.metascore')
        if rating_:
            try:
                scores.append(int(rating_[0].text))
            except ValueError:
                if not RE_NA_SCORE.search(rating_[0].text):
                    logger.error('failed to get metascore from "%s"', log)
        rating_ = li.cssselect('.textscore')
        if rating_:
            try:
                # User score is on a 0-10 scale; scale to 0-100
                scores.append(int(float(rating_[0].text) * 10))
            except ValueError:
                if not RE_NA_SCORE.search(rating_[0].text):
                    logger.error('failed to get user score from %s',
                            html.tostring(rating_[0]))
        if scores:
            info['rating'] = sum(scores) / len(scores)
        info.update(self._get_media_info(info['url']))
        if re_artist and not re_artist.search(info.get('artist', '')):
            continue
        return info
def get_track(self, artist, album):
    '''Return the first "artist album" search result whose title matches
    the artist and that has watch/thumbnail URLs.
    '''
    artist = clean(artist)
    album = clean(album)
    re_artist = Title(artist).get_search_re(mode='__all__')
    query = '%s %s' % (artist, album)
    for result in self.results(query):
        complete = (result['title'] and result['url_watch']
                and result['urls_thumbnails'])
        if complete and re_artist.search(result['title']):
            return result
def _get_name_url_info(self, url):
    '''Scrape a person page: "known for" titles plus director/actor
    filmography lists. Returns None when the page cannot be opened.
    '''
    if not self.browser.open(url):
        return
    info = {
        'url': url,
        'titles_known_for': [],
        }
    # Get "known for" titles
    for div in self.browser.cssselect('div#knownfor > div', []):
        links = div.cssselect('a')
        if not links:
            continue
        title = links[-1].text
        res = RE_TITLE.search(title)
        if not res:
            logger.error('failed to get title and date from "%s"', title)
            continue
        title, date = res.groups()
        info['titles_known_for'].append({
            'title': clean(title, 1),
            'date': int(date),
            'url': urljoin(self.url, links[-1].get('href')),
            })
    # Get filmography
    for category, el_id in [
            ('titles_director', 'filmo-head-director'),
            ('titles_actor', 'filmo-head-actor'),
            ]:
        info.setdefault(category, [])
        # Pass the [] default like every other iterated cssselect call so a
        # missing section gives an empty loop instead of iterating None
        for el in self.browser.cssselect(
                '#%s + .filmo-category-section div' % el_id, []):
            links = el.cssselect('a')
            if not links:
                continue
            text = ''.join(el.xpath("text()"))
            if RE_TITLES_EXCL.search(text):
                continue
            title = {
                'title': clean(links[0].text, 1),
                'url': urljoin(self.url, links[0].get('href')),
                }
            els = el.cssselect('.year_column')
            if els:
                res = RE_DATE.findall(els[0].text)
                if res:
                    title['date'] = int(res[0])
            info[category].append(title)
    return info
def _has_unrelated(self, name, path):
    '''Check unrelated media in the given directory.
    '''
    # Small videos (under 10% of our own size) are tolerated
    threshold = get_size(self.file) / 10.0
    target = clean(name, 9)
    for other in files(path, types=self.TYPES):
        if other.file == self.file:
            continue
        display = other.get_file_info().get('display_name')
        if display and clean(display, 9) == target:
            continue
        if other.type == self.type == 'video' \
                and get_size(other.file) < threshold:
            continue
        return True
def _releases(self, type):
    '''Yield release info dicts (title, url, rating, date and, for music,
    artist) for the given release type.
    '''
    url = URLS.get(type)
    if not url:
        logger.error('unhandled release type "%s"', type)
        return
    self.browser.open(url)
    now = datetime.utcnow()
    year = now.year
    for product in self.browser.cssselect('li.product', []):
        log = html.tostring(product, pretty_print=True)[:1000]
        title_tags = product.cssselect('.product_title a')
        if not title_tags:
            continue
        info = {
            'title': clean(title_tags[0].text, 1),
            'url': urljoin(self.url, title_tags[0].get('href')),
            }
        if type.startswith('music_'):
            artist_tags = product.cssselect('.product_artist .data')
            if not artist_tags:
                continue
            info['artist'] = clean(artist_tags[0].text, 1)
        rating_tags = product.cssselect('.metascore')
        if not rating_tags:
            continue
        try:
            info['rating'] = int(rating_tags[0].text)
        except ValueError:
            if not RE_NA_SCORE.search(rating_tags[0].text):
                logger.error('failed to get rating from "%s"', log)
            continue
        date_tags = product.cssselect('.release_date .data')
        if not date_tags:
            continue
        res = RE_DATE.search(date_tags[0].text)
        if not res:
            logger.error('failed to get date from "%s"', log)
            continue
        date_str = '%s %s %02d' % (year, res.group(1).lower(), int(res.group(2)))
        date = datetime.strptime(date_str, '%Y %b %d')
        if date > now:
            # Listing shows no year: a future date belongs to last year
            date = datetime(date.year - 1, date.month, date.day)
        info['date'] = date
        yield info
def get_trailer(self, title, date=None):
    '''Find a trailer video result for the given title, trying the most
    specific query first.
    '''
    title = clean(title)
    re_title = Title(title).get_search_re(mode='__all__')
    queries = []
    if date:
        queries.append('%s %s trailer' % (title, date))
    queries.append('%s trailer' % title)
    queries.append(title)
    for query in queries:
        for result in self.results(query):
            if not re_title.search(clean(result['title'])):
                continue
            if result['url_watch'] and result['urls_thumbnails']:
                return result
def _get_media_info(self, url):
    '''Fetch artist and genre info from a media page.
    '''
    browser = Browser()
    browser.open(url)
    info = {}
    band_tags = browser.cssselect('.band_name')
    if band_tags:
        info['artist'] = clean(band_tags[0].text, 1)
    genre_tags = browser.cssselect('.product_genre .data')
    if genre_tags:
        genres = genre_tags[0].text.split(',')
        info['genre'] = [clean(genre, 1) for genre in genres]
    return info
def get_similar(self, artist):
    '''Get similar artists.
    '''
    similar = []
    if not self._get_band_url(artist):
        return similar
    for block in self.browser.cssselect('p.alt2', []):
        # Only the block labelled "similar bands" is of interest
        if clean(block[0][0].text, 1) != 'similar bands':
            continue
        for anchor in block[1:]:
            similar.append({
                'name': clean(anchor.text, 1),
                'url': urljoin(self.url, anchor.get('href')),
                })
        break
    return similar
def reviews(self):
    # Yield review info dicts (artist, album, rating, date and urls)
    # scraped from the reviews page. Entries missing a required field are
    # skipped; thumbnail/review urls are optional.
    if not self.url:
        return
    url = self._get_reviews_url()
    if not url:
        logger.error('failed to get reviews url at %s', self.url)
        return
    self.browser.open(url)
    for td in self.browser.cssselect('tr.alt1 td', []):
        log = html.tostring(td, pretty_print=True)[:1000]
        info = {}
        links = td.cssselect('a')
        if not links:
            logger.error('failed to get release from %s', log)
            continue
        try:
            # Artist is the first leaf of the second link
            info['artist'] = clean(links[1][0][0].text, 1)
        except Exception:
            logger.error('failed to get artist from %s', log)
            continue
        try:
            # Album is the last leaf of the same link
            info['album'] = clean(links[1][0][-1].text, 1)
        except Exception:
            logger.error('failed to get album from %s', log)
            continue
        try:
            info['rating'] = float(td[-1][-1].text)
        except Exception:
            # Rating is required; skip silently when absent
            continue
        try:
            y, m, d = RE_DATE_REVIEW.search(td[-1].text).groups()
            info['date'] = datetime(int(y), int(m), int(d))
        except Exception:
            logger.debug('failed to get date from %s', log)
            continue
        try:
            info['url_review'] = urljoin(self.url, links[0].get('href'))
        except Exception:
            logger.error('failed to get review url from %s', log)
        try:
            # Thumbnail is optional: log and fall through on failure
            info['url_thumbnail'] = urljoin(self.url, links[0][0].get('src'))
        except Exception:
            logger.error('failed to get thumbnail url from %s', log)
        yield info
def get_info(self, query):
    '''Return info (rating, title, url, optional thumbnail) for the first
    search result matching the query, or None.
    '''
    if not self.browser.submit_form(self.url, fields={'search': query}):
        return
    re_query = Title(query).get_search_re()
    for item in self.browser.cssselect('#movie_results_ul li', []):
        log = html.tostring(item, pretty_print=True)[:1000]
        meter_tags = item.cssselect('.tMeterScore')
        if not meter_tags:
            continue
        res = RE_RATING.search(meter_tags[0].text)
        if not res:
            logger.error('failed to get rating from "%s"', log)
            continue
        title_tags = item.cssselect('.nomargin a')
        if not title_tags:
            logger.error('failed to get title from %s', log)
            continue
        title = clean(title_tags[0].text, 1)
        if not re_query.search(title):
            continue
        info = {
            'rating': int(res.group(1)),
            'title': title,
            'url': urljoin(self.url, title_tags[0].get('href')),
            }
        url_thumbnail = self._get_thumbnail_url(info['url'])
        if url_thumbnail:
            info['url_thumbnail'] = url_thumbnail
        return info
def _get_filename(remote):
    '''Extract a clean filename from the response Content-Disposition
    header, or return None.
    '''
    if not remote:
        return
    disposition = remote.info().get('Content-Disposition')
    if not disposition:
        return
    matches = RE_CONTENT_FILENAME.findall(disposition)
    if matches:
        return clean(matches[0])
def results(self, query, pages_max=1):
    '''Yield search result dicts (title, url, page), following pagination
    up to pages_max pages.
    '''
    for page in range(1, pages_max + 1):
        if page == 1:
            self.browser.submit_form(self.url, fields={'q': query})
        elif not self._next(page):
            break
        for item in self.browser.cssselect('li.g', []):
            log = html.tostring(item, pretty_print=True)[:1000]
            anchors = item.cssselect('a')
            if not anchors:
                logger.error('failed to get links from %s', log)
                continue
            url = anchors[0].get('href')
            # Skip relative/scheme-less links
            if not url or not urlparse(url).scheme:
                continue
            title = clean(self.get_link_text(html.tostring(anchors[0])))
            if not title:
                continue
            yield {
                'title': title,
                'url': url,
                'page': page,
                }
def releases(self):
    '''Yield release dicts (title, url and, when parsable, rating) for
    each configured release type.
    '''
    for release_type, re_release in RE_RELEASES_URLS.items():
        if not self.browser.follow_link(text_regex=re_release):
            logger.error('failed to get %s releases', release_type)
            continue
        for item in self.browser.cssselect('.list_item', []):
            log = html.tostring(item, pretty_print=True)[:1000]
            info_tags = item.cssselect('.info a')
            if not info_tags:
                logger.error('failed to get link from %s', log)
                continue
            result = {
                'title': clean(info_tags[0].text, 1),
                'url': urljoin(self.url, info_tags[0].get('href')),
                }
            rating_tags = item.cssselect('.rating-rating .value')
            if not rating_tags:
                logger.error('failed to get rating from %s', log)
                continue
            try:
                result['rating'] = float(rating_tags[0].text)
            except ValueError:
                # Rating stays unset on parse failure; still yield
                logger.error('failed to get rating from %s', log)
            yield result
def _subtitles_urls(self, re_name, date=None, url=None):
    '''Yield subtitle page URLs whose title matches re_name (and date,
    when given), recursing into linked result pages.
    '''
    if url and not self.browser.open(url):
        return
    rows = self.browser.cssselect('#search_results tr[id]')
    if not rows:
        # skip tvshow whole season page
        if not self.browser.cssselect('#search_results'):
            yield self.browser.geturl()
        return
    for row in rows:
        anchors = row.cssselect('a')
        if not anchors:
            continue
        title = clean(anchors[0].text)
        if not re_name.search(title):
            continue
        date_ = self._get_date(title)
        if date and date_ and abs(date - date_) > 1:
            continue
        child_url = urljoin(self.url, anchors[0].get('href'))
        for sub_url in self._subtitles_urls(re_name=re_name, date=date,
                url=child_url):
            yield sub_url
def create_similar():
    '''Create a similar-search from the request JSON, either from an
    existing object id or from explicit name/category fields.
    '''
    data = request.json
    if not data.get('recurrence'):
        return jsonify(error='missing recurrence')
    if 'id' in data:
        object_id = ObjectId(data['id'])
        object_type = data.get('type')
        search = _get_object_search(object_id, object_type)
        if not search:
            return jsonify(error='%s %s does not exist' % (object_type, object_id))
        similar = {
            'name': search['name'],
            'category': search['category'],
            }
    else:
        if not data.get('name'):
            return jsonify(error='missing name')
        if not data.get('category'):
            return jsonify(error='missing category')
        similar = {
            'name': clean(data['name'], 1),
            'category': data['category'],
            }
    similar['recurrence'] = int(data['recurrence'])
    similar['langs'] = data.get('langs') or []
    if not SimilarSearch.add(**similar):
        return jsonify(error='failed to create similar %s' % similar)
    return jsonify(result=True)
def _next(self, page):
    '''Open the results page numbered `page`; returns the open() result,
    or None when no matching pagination link exists.
    '''
    for anchor in self.browser.cssselect('#nav a'):
        text = clean(self.get_link_text(html.tostring(anchor)))
        try:
            anchor_page = int(text)
        except ValueError:
            # Non-numeric pagination link (prev/next arrows etc.)
            continue
        if anchor_page == page:
            return self.browser.open(urljoin(self.url, anchor.get('href')))
def _get_title_url_info(self, url):
    # Scrape a title page: title, date, thumbnail, rating, details and
    # credit/metadata blocks. Returns None when the page cannot be opened
    # or has no usable header.
    if not self.browser.open(url):
        return
    info = {'url': url}
    headers = self.browser.cssselect('.header')
    if not headers:
        logger.error('failed to get title from %s', url)
        return
    titles = headers[0].cssselect('[itemprop="name"]')
    if not titles:
        return
    info['title'] = clean(titles[0].text, 1)
    dates = headers[0].cssselect('.nobr')
    if dates:
        res = RE_DATE.search(clean(html.tostring(dates[0]), 1))
        if res:
            info['date'] = int(res.group(1))
    res = self.browser.cssselect('#img_primary img')
    if res:
        info['url_thumbnail'] = res[0].get('src')
    res = self.browser.cssselect('div.star-box-giga-star')
    if res:
        info['rating'] = float(clean(res[0].text))
    res = self.browser.cssselect('.infobar')
    if res:
        info['details'] = clean(res[0].text, 1)
    # Metadata blocks: first child holds the block label, the rest holds
    # the values
    tags = self.browser.cssselect('div.txt-block', []) + self.browser.cssselect('div.inline', [])
    for tag in tags:
        if tag is None or not len(tag):
            continue
        title = clean(tag[0].text, 1)
        # startswith matches both "director" and "directors"
        if title.startswith('director'):
            info['director'] = [clean(a.text, 1) for a in tag.cssselect('a span') if not RE_NAMES_EXCL.search(a.text)]
        elif title == 'stars':
            info['stars'] = [clean(a.text, 1) for a in tag.cssselect('a span') if not RE_NAMES_EXCL.search(a.text)]
        elif title == 'country':
            info['country'] = [clean(a.text, 1) for a in tag.cssselect('a')]
        elif title == 'genres':
            info['genre'] = [clean(a.text, 1) for a in tag.cssselect('a')]
        elif title == 'runtime':
            info['runtime'] = tag[1].text
    return info
def results(self, query):
    '''Run a YouTube search for `query`, ordered by relevance and
    including restricted content.
    '''
    yt_query = gdata.youtube.service.YouTubeVideoQuery()
    yt_query.vq = clean(query)
    yt_query.orderby = 'relevance'
    yt_query.racy = 'include'
    try:
        feed = self.yt_service.YouTubeQuery(yt_query)
    # "except X as e" replaces the Python-2-only "except X, e"
    except Exception as e:
        logger.error('failed to process query "%s": %s', query, str(e))
        return
    # NOTE(review): `feed` is unused within this chunk - the code iterating
    # it likely follows in the full source; confirm before removing.
def get_results_count(self, query):
    '''Get the results count for a query.
    '''
    self.browser.submit_form(self.url, fields={'q': query})
    stats = self.browser.cssselect('#resultStats')
    if not stats:
        return
    matches = RE_NB_RESULTS.findall(clean(stats[0].text))
    if matches:
        # Strip thousands separators before converting
        return int(re.sub(r'\D+', '', matches[0]))
def get_query(query, category=None):
    '''Normalize a raw query string for searching, with category-specific
    title handling for tv and anime.
    '''
    query = clean(query, 1)
    if category == 'tv':
        query = Title(query).name
    elif category == 'anime':
        query = Title(query).display_name
    # Collapse punctuation/underscores and stop words, then trim the
    # leading article and edge punctuation
    query = re.sub(r'[\W_]+|\s+s\s+|\sand\s|\sor\s|\snot\s', ' ', query)
    return re.sub(r'^the\s+|^[\W_]+|[\W_]+$', '', query)
def get_info(file):
    '''Get main info by category.
    '''
    res = {}
    for cat, info in parse(file).items():
        if cat == 'general':
            try:
                res['duration'] = int(info.get('duration')) / 1000    # seconds
            except Exception:
                pass
            try:
                res['bitrate'] = int(info.get('overall bit rate'))    # bps
            except Exception:
                pass
            # Tags
            res['artist'] = clean(info.get('performer', ''), 1)
            res['album'] = clean(info.get('album', ''), 1)
            try:
                res['date'] = int(info.get('recorded date'))
            except Exception:
                pass
            res['title'] = clean(info.get('track name', ''), 1)
            try:
                res['track_number'] = int(info.get('track name/position'))
            except Exception:
                pass
        else:
            # Normalize the first audio stream's category name
            if cat == 'audio #1':
                cat = 'audio'
            try:
                res['%s_bitrate' % cat] = int(info.get('bit rate'))    # bps
            except Exception:
                pass
            res['%s_codec' % cat] = info.get('codec')
            res['%s_codec_id' % cat] = info.get('codec id')
    return res
def _artist_albums(self, url, pages_max):
    # Yield album info dicts (title, url, thumbnail, date) from the artist
    # albums listing, paginating up to pages_max pages. Albums without a
    # title, date or enough tracks are skipped.
    for i in range(pages_max):
        if i > 0:
            url = self._get_next_page_url()
            if not url:
                return
        self.browser.open(url)
        for tag in self.browser.cssselect('.album-item', []):
            log = html.tostring(tag, pretty_print=True)[:1000]
            meta_tags = tag.cssselect('[itemprop="name"]')
            if not meta_tags:
                continue
            title = clean(meta_tags[0].get('content', ''), 1)
            if not title:
                continue
            info_album = {'title': title}
            url_tags = tag.cssselect('a')
            if url_tags:
                info_album['url'] = urljoin(self.url, url_tags[0].get('href'))
            else:
                logger.error('failed to get album url from %s', log)
            url_thumbnails = tag.cssselect('.album-item-cover img')
            if url_thumbnails:
                url_ = url_thumbnails[0].get('src')
                # Skip placeholder "unknown" cover images
                if not RE_THUMBNAIL_UNKNOWN.search(urlparse(url_).path):
                    info_album['url_thumbnail'] = url_
            else:
                logger.error('failed to get album thumbnail url from %s', log)
            date_tags = tag.cssselect('time')
            if not date_tags:
                continue
            try:
                date = RE_DATE_ALBUM.search(date_tags[0].get('datetime'))
                info_album['date'] = int(date.group(1))
            except Exception:
                continue
            # Check nb tracks
            tracks_tags = tag.cssselect('[itemprop="numTracks"]')
            if not tracks_tags:
                continue
            try:
                nb_tracks = int(tracks_tags[0].text)
            except ValueError:
                continue
            if nb_tracks < MIN_ALBUM_TRACKS:
                continue
            yield info_album
def _get_artist_url(self, artist):
    '''Return the URL of the first search result whose name matches the
    artist, or None.
    '''
    results_url = self._get_results_url(artist)
    if not results_url:
        return
    re_name = Title(artist).get_search_re()
    self.browser.open(results_url)
    for item in self.browser.cssselect('.artistsWithInfo li', []):
        anchors = item.cssselect('a')
        if not anchors:
            continue
        name = clean(self.get_link_text(html.tostring(anchors[0])))
        if re_name.search(name):
            return urljoin(self.url, self._clean_url(anchors[0].get('href')))
def _get_torrent_url(self, query, url):
    '''Return the first magnet URL whose display name ("dn") matches the
    query, or None.
    '''
    re_query = Title(query).get_search_re(mode='__lazy__')
    for mirror_url in self._mirror_urls(url):
        for torrent_url in self._torrent_urls(mirror_url):
            magnet = parse_magnet_url(torrent_url)
            if not magnet or 'dn' not in magnet:
                continue
            if re_query.match(clean(magnet['dn'][0])):
                return torrent_url
def get_file_info(self):
    '''Get the file info.
    '''
    info = get_info(self.file)
    if info:
        artist, album = info['artist'], info['album']
        # Join artist and album only when both are present
        sep = ' ' if artist and album else ''
        info['full_name'] = '%s%s%s' % (artist, sep, album)
        sep = ' - ' if artist and album else ''
        info['display_name'] = '%s%s%s' % (artist, sep, album)
        if info.get('date'):
            sep = ' - ' if info['display_name'] else ''
            info['display_name'] = '%s%s%s' % (info['display_name'], sep, info['date'])
        info['subtype'] = 'music'
    if not info.get('display_name'):
        # Fall back to the parent directory name
        info['display_name'] = clean(self.dir, 1)
    return info
def _get_urls(self, query, type='title'):
    '''Return URLs for the query: the page the search redirected straight
    to, or matching result links otherwise.
    '''
    self.browser.addheaders = [('Accept-Language', 'en-US,en')]
    if not self.browser.submit_form(self.url, fields={'q': query}):
        return []
    current_url = self.browser.geturl()
    if RE_URLS[type].search(current_url):
        # The search redirected directly to a matching page
        return [current_url]
    urls = []
    re_name = Title(query).get_search_re()
    for anchor in self.browser.cssselect('.result_text a', []):
        if not re_name.search(clean(anchor.text)):
            continue
        result_url = urljoin(self.url, anchor.get('href'))
        if RE_URLS[type].search(result_url):
            urls.append(result_url)
    return urls
def _similar_artists(self, url, pages_max):
    '''Yield similar artists (name, url) from the listing, paginating up
    to pages_max pages.
    '''
    for page in range(pages_max):
        if page > 0:
            url = self._get_next_page_url()
            if not url:
                return
        self.browser.open(url)
        for item in self.browser.cssselect('.similar-artists li', []):
            anchors = item.cssselect('a')
            names = item.cssselect('.link-reference h3')
            if anchors and names:
                yield {
                    'name': clean(names[0].text, 1),
                    'url': urljoin(self.url, anchors[0].get('href')),
                    }