def artist_musicbrainz_id( artist_id, artist_mbid ):
    """Build a dict holding the name and MusicBrainz ID of a library artist.

    artist_id   - identifier passed to retrieve_artist_details()
    artist_mbid - MusicBrainz artist ID supplied by the caller (may be empty)

    Returns a dict with the keys "name" and "musicbrainz_artistid".
    """
    artist_details = retrieve_artist_details( artist_id )
    label = get_unicode( artist_details["label"] )
    # Must be a dict: the previous list initialiser made every keyed
    # assignment below raise TypeError.
    artist = { "name": label }
    if not artist_details["musicbrainzartistid"] or not artist_mbid:
        # Either ID is missing -> look the artist up by name.
        _name, mbid, _sortname = get_musicbrainz_artist_id( label )
        artist["musicbrainz_artistid"] = mbid
    else:
        # Both IDs are present here, so the caller-supplied one is used.
        # NOTE(review): the original also had a fallback to the stored
        # library ID in this branch, but it was unreachable with the `or`
        # condition above - confirm whether `and` was intended.
        artist["musicbrainz_artistid"] = artist_mbid
    return artist
def get_records(self):
    """Parse every row of the active worksheet and yield the built records.

    Rows whose cells are all empty (according to self._empty) are skipped.
    Records are yielded as soon as the row that produced them is parsed.
    """
    sheet = self.wb.get_active_sheet()
    row_parser = AbbyParser(self.parsers, self.context, self.record_builder)

    for row in sheet.iter_rows():
        # decode the raw value of each cell in the row
        values = [get_unicode(cell.internal_value) for cell in row]

        if self._empty(values):
            continue

        # drop the empty cells trailing at the end of the row
        values = self._remove_lasts_none(values)

        # a single row may produce several records
        for record in row_parser.parse_row(values):
            yield record
def set_imgs(self, imgs):
    """Store the decoded image urls under both `images` and `imgs`.

    Both attributes are bound to the same list object.
    """
    decoded = []
    for img in imgs:
        decoded.append(get_unicode(img))
    self.images = decoded
    self.imgs = decoded
def set_top_img_no_check(self, src_url):
    """Store the decoded url under both `top_img` and `top_image`."""
    self.top_img = self.top_image = get_unicode(src_url)
def set_title(self, title):
    """Set the title, truncated to config.MAX_TITLE characters.

    An already-set title is kept when the new one is empty, so a failed
    <title> extraction does not wipe an earlier educated guess.
    """
    keep_existing = bool(self.title) and not title
    if not keep_existing:
        self.title = get_unicode(title[:self.config.MAX_TITLE])
def set_authors(self, authors):
    """Store the author names, e.g. ["firstName lastName", ...].

    Raises Exception when `authors` is not a list.  An empty list leaves
    the current value untouched; otherwise at most config.MAX_AUTHORS
    names are kept.
    """
    if not isinstance(authors, list):
        raise Exception("authors input must be list!")
    if not authors:
        return
    limit = self.config.MAX_AUTHORS
    self.authors = [get_unicode(name) for name in authors[:limit]]
def set_keywords(self, keywords):
    """Store the keywords as a list.

    Raises Exception when `keywords` is not a list.  An empty list leaves
    the current value untouched; otherwise at most config.MAX_KEYWORDS
    entries are kept.
    """
    if not isinstance(keywords, list):
        raise Exception("Keyword input must be list!")
    if not keywords:
        return
    capped = keywords[:self.config.MAX_KEYWORDS]
    self.keywords = [get_unicode(word) for word in capped]
def _normalize_product_units(self, product_units):
    """Return u"kilogramos" when the stripped unit is u"Kg.".

    Any other unit is returned unchanged.
    """
    if get_unicode(product_units.strip()) == u"Kg.":
        # use the long form of the unit
        return u"kilogramos"
    return product_units
def fromstring(cls, html):
    """Parse `html` with lxml, store the tree on `cls.doc` and return it.

    Any exception raised while parsing is printed and None is returned,
    so one bad article does not bring down a whole batch.
    """
    markup = utils.get_unicode(html, is_html=True)
    try:
        has_declaration = markup.startswith('<?')
        if has_declaration:
            # lxml rejects unicode input that still carries an encoding
            # declaration (Issue #78), so strip the leading <? ... ?>
            markup = re.sub(r'^\<\?.*?\?\>', '', markup, flags=re.DOTALL)
        cls.doc = lxml.html.fromstring(markup)
        return cls.doc
    except Exception:
        traceback.print_exc()
        return None
def parse(self):
    """Join both halves of a split head1 row and hand it to Head1Parser."""
    # build the complete Head1 row from the previous row plus the first
    # cell of the current one
    joined = get_unicode(self.context.last_row + u" " + self.row[0])
    tbl_head1_row = [joined.strip()]

    if not Head1Parser(tbl_head1_row).accepts():
        # row was not accepted: report the problem
        print(tbl_head1_row)
        print("Ocurrio un error con un Head1Parser partido!")
    else:
        Head1Parser(tbl_head1_row, self.context).parse()

    # record which row type was processed last
    self.context.row_type = "tbl_head1_final_part"
def fulltext(html, language='en'):
    """Return the main body text extracted from an article HTML string.

    The pipeline is: parse -> clean -> pick best node -> post cleanup ->
    format.  `language` is set on the Configuration that every stage uses.
    """
    from .cleaners import DocumentCleaner
    from .configuration import Configuration
    from .extractors import ContentExtractor
    from .outputformatters import OutputFormatter

    cfg = Configuration()
    cfg.language = language

    content_extractor = ContentExtractor(cfg)
    cleaner = DocumentCleaner(cfg)
    formatter = OutputFormatter(cfg)

    tree = cleaner.clean(cfg.get_parser().fromstring(html))
    best_node = content_extractor.post_cleanup(
        content_extractor.calculate_best_node(tree))

    # the formatter also returns the article html, which is not needed here
    body_text, _article_html = formatter.get_formatted(best_node)
    return get_unicode(body_text)
def set_summary(self, summary):
    """Store the summary, truncated to config.MAX_SUMMARY characters."""
    limit = self.config.MAX_SUMMARY
    self.summary = get_unicode(summary[:limit])
def get_musicbrainz_album( album_title, artist, e_count, limit=1, with_singles=False, by_release=False, use_alias=False, use_live=False ):
    """ Retrieves information for an album from MusicBrainz using the provided album title and artist name.

        Use:
            album, albums = get_musicbrainz_album( album_title, artist, e_count, limit, with_singles, by_release )

        album_title  - the album title (must be unicode)
        artist       - the artist's name (must be unicode)
        e_count      - used internally (should be set to 0)
        limit        - limit the number of responses
        with_singles - set to True to look up single releases at the same time
        by_release   - use release name for search
        use_alias    - build the query from release_group_url_alias instead of release_group_url_artist
        use_live     - append the live_nosingles filter instead of nolive_nosingles

        Returns a tuple ( album, albums ):
            album  - dict with the keys "score", "id", "title", "artist" and "artist_id"
                     (empty strings where nothing was found)
            albums - list of such dicts; only filled when limit != 1

        When limit == 1 and no id was found, the function calls itself again with a different
        flag combination (live -> alias -> release name -> release name + alias -> singles ->
        singles + alias) before giving up.
    """
    match_within = "~2"
    album = {}
    albums = []
    count = e_count  # NOTE(review): never read in this function
    album["score"] = ""
    album["id"] = ""
    album["title"] = ""
    album["artist"] = ""
    album["artist_id"] = ""
    log( "Artist: %s" % smart_unicode( artist ), xbmc.LOGDEBUG )
    album_temp = smart_unicode( album_title )  # NOTE(review): never read in this function
    artist = smart_unicode( get_unicode( artist ) )
    album_title = smart_unicode( get_unicode( album_title ) )
    log( "Artist: %s" % artist, xbmc.LOGDEBUG )
    log( "Album: %s" % album_title, xbmc.LOGDEBUG )
    # double quotes are replaced with '?' before the names go into the query
    artist = artist.replace('"','?')
    album_title = album_title.replace('"','?')
    if limit == 1:
        # single-result lookup: build the query url for the current flag combination
        if not use_alias:
            url = release_group_url_artist % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) )
            if not with_singles and not by_release and not use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles or Live albums", xbmc.LOGDEBUG )
                url = url + nolive_nosingles + query_limit % limit
            elif not with_singles and not by_release and use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles", xbmc.LOGDEBUG )
                url = url + live_nosingles + query_limit % limit
            elif not by_release:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Including Singles and Live albums", xbmc.LOGDEBUG )
                url = url + query_limit % limit
            elif not with_singles:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Using Release Name", xbmc.LOGDEBUG )
                url = release_group_url_artist % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) ) + query_limit % limit
            # NOTE(review): with by_release and with_singles both set, no branch above
            # matches and the url is used without the query_limit suffix
        elif use_alias:
            url = release_group_url_alias % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) )
            if not with_singles and not by_release and not use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles or Live albums", xbmc.LOGDEBUG )
                url = url + nolive_nosingles + query_limit % limit
            elif not with_singles and not by_release and use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles", xbmc.LOGDEBUG )
                url = url + live_nosingles + query_limit % limit
            elif not by_release:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Including Singles and Live albums", xbmc.LOGDEBUG )
                url = url + query_limit % limit
            elif not with_singles:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Using Release Name", xbmc.LOGDEBUG )
                url = release_group_url_alias % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) ) + query_limit % limit
        htmlsource = get_html_source( url, "", save_file = False, overwrite = False )
        match = re.search( '''<release-group-list count="(?:.*?)" offset="(?:.*?)">(.*?)</release-group-list>''', htmlsource )
        if match:
            # the fields are taken from the first occurrence in the whole response
            try:
                mbid = re.search( '''<release-group id="(.*?)"(?:.*?)">''', htmlsource)
                if not mbid:
                    mbid = re.search( '''<release-group (?:.*?)id="(.*?)">''', htmlsource )
                mbtitle = re.search( '''<title>(.*?)</title>''', htmlsource)
                mbartist = re.search( '''<name>(.*?)</name>''', htmlsource)
                mbartistid = re.search( '''<artist id="(.*?)">''', htmlsource)
                album["id"] = mbid.group(1)
                album["title"] = unescape( smart_unicode( mbtitle.group(1) ) )
                album["artist"] = unescape( smart_unicode( mbartist.group(1) ) )
                album["artist_id"] = mbartistid.group(1)
            except:
                # any failure (e.g. a pattern that did not match) leaves the remaining
                # fields at their empty defaults
                pass
        if not album["id"]:
            xbmc.sleep( mb_delay ) # sleep for allowing proper use of webserver
            # nothing found: retry with the next flag combination in the chain
            if not with_singles and not by_release and not use_alias and not use_live:
                log( "No releases found on MusicBrainz, Checking For Live Album", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, False, False, True ) # try again including live albums
            elif not with_singles and not by_release and not use_alias and use_live:
                log( "No releases found on MusicBrainz, Checking by Artist Alias", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, False, True, False ) # try again by using artist alias
            elif use_alias and not with_singles and not by_release and not use_live:
                log( "No releases found on MusicBrainz, Checking by Release Name", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, True, False, False ) # try again by using release name
            elif by_release and not with_singles and not use_alias:
                log( "No releases found on MusicBrainz, Checking by Release name and Artist Alias", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, True, True, False ) # try again by using release name and artist alias
            elif by_release and not with_singles and use_alias:
                log( "No releases found on MusicBrainz, checking singles", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, True, False, False, False ) # try again with singles
            elif with_singles and not use_alias and not by_release:
                log( "No releases found on MusicBrainz, checking singles and Artist Alias", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, True, False, True, False ) # try again with singles and artist alias
            else:
                # end of the chain: only the artist name and id are looked up
                log( "No releases found on MusicBrainz.", xbmc.LOGDEBUG )
                album["artist"], album["artist_id"], sort_name = get_musicbrainz_artist_id( artist )
    else:
        # multi-result lookup: one query, every release group becomes an entry in albums
        match_within = "~4"
        # NOTE(review): unlike the limit == 1 branch, the title and artist are not passed
        # through quote_plus here - confirm this is intended
        url = release_group_url_artist % ( server, ( album_title.encode("utf-8") ), match_within, ( artist.encode("utf-8") ) ) + query_limit % limit
        htmlsource = get_html_source( url, "", save_file = False, overwrite = False )
        match = re.search( '''<release-group-list count="(?:.*?)" offset="(?:.*?)">(.*?)</release-group-list>''', htmlsource )
        if match:
            match_release_group = re.findall( '''<release-group(.*?)</release-group>''', match.group( 1 ) )
            if match_release_group:
                for item in match_release_group:
                    # fresh dict per release group; the last one is also returned as album
                    album = {}
                    album["score"] = ""
                    album["id"] = ""
                    album["title"] = ""
                    album["artist"] = ""
                    album["artist_id"] = ""
                    try:
                        mbscore = re.search( '''score="(.*?)"''', item)
                        mbid = re.search( '''<release-group id="(.*?)"(?:.*?)">''', item)
                        if not mbid:
                            mbid = re.search( '''id="(.*?)"(?:.*?)">''', item)
                            if not mbid:
                                mbid = re.search( '''<release-group (?:.*?)id="(.*?)">''', htmlsource )
                        mbtitle = re.search( '''<title>(.*?)</title>''', item)
                        mbartist = re.search( '''<name>(.*?)</name>''', item)
                        mbartistid = re.search( '''<artist id="(.*?)">''', item)
                        album["score"] = mbscore.group(1)
                        album["id"] = mbid.group(1)
                        album["title"] = unescape( smart_unicode( mbtitle.group(1) ) )
                        album["artist"] = unescape( smart_unicode( mbartist.group(1) ) )
                        album["artist_id"] = mbartistid.group(1)
                        log( "Score : %s" % album["score"], xbmc.LOGDEBUG )
                        log( "Title : %s" % album["title"], xbmc.LOGDEBUG )
                        log( "Id : %s" % album["id"], xbmc.LOGDEBUG )
                        log( "Artist : %s" % album["artist"], xbmc.LOGDEBUG )
                        log( "Artist ID : %s" % album["artist_id"], xbmc.LOGDEBUG )
                        albums.append(album)
                    except:
                        # an entry that fails to parse is reported and not appended
                        print_exc()
            else:
                pass
        else:
            pass
    xbmc.sleep( mb_delay ) # sleep for allowing proper use of webserver
    return album, albums
def download_art( url_cdart, album, database_id, type, mode, size, background = False ):
    """Download one artwork file and copy it into the album/artist folder.

    url_cdart   - url of the artwork to download
    album       - dict; "path" is read always, "artist" and "title" are read
                  for the progress dialog
    database_id - id handed to get_thumbnail_path() / get_fanart_path()
    type        - artwork type, e.g. "cdart", "cover", "fanart", "clearlogo",
                  "artistthumb", "musicbanner"
    mode        - "auto", "single" or anything else (treated as manual)
    size        - unused by this function
    background  - passed through to every dialog_msg() call

    Returns:
        mode "auto"/"single": ( message, download_success, final_destination, is_canceled )
        any other mode:       ( message, download_success, is_canceled )
        unknown file name:    ( message, download_success )

    NOTE(review): the unknown-file-name return has a different arity from
    the other two and does not close the dialog created above it; it is
    left unchanged here because callers are not visible.
    """
    log( "Downloading artwork... ", xbmc.LOGDEBUG )
    download_success = False
    percent = 1
    # Python 2 has no `nonlocal`, so the progress hook records a cancel
    # request in a mutable holder. A plain assignment inside the hook only
    # created a hook-local name and the cancel flag was never returned.
    cancel_state = { "canceled": False }
    if mode == "auto":
        dialog_msg( "update", percent = percent, background = background )
    else:
        dialog_msg( "create", heading = __language__(32047), background = background ) #Onscreen Dialog - "Downloading...."
    file_name = get_filename( type, url_cdart, mode )
    #Helix: paths MUST end with trailing slash
    path = os.path.join(album["path"].replace( "\\\\" , "\\" ), '')
    if file_name == "unknown":
        log( "Unknown Type ", xbmc.LOGDEBUG )
        message = [ __language__(32026), __language__(32025), "File: %s" % get_unicode( path ), "Url: %s" % get_unicode( url_cdart ) ]
        return message, download_success
    # NOTE(review): thumbnail_path is never read below; the helper calls are
    # kept in case they have side effects.
    if type in ( "artistthumb", "cover" ):
        thumbnail_path = get_thumbnail_path( database_id, type )
    else:
        thumbnail_path = ""
    if type == "fanart" and mode in ( "manual", "single" ):
        thumbnail_path = get_fanart_path( database_id, type )
    if not exists( path ):
        try:
            _makedirs( album["path"].replace( "\\\\" , "\\" ) )
        except Exception:
            # Still best effort (the exists() check below reports the
            # failure to the caller), but no longer silent.
            print_exc()
    log( "Path: %s" % path, xbmc.LOGDEBUG )
    log( "Filename: %s" % file_name, xbmc.LOGDEBUG )
    log( "url: %s" % url_cdart, xbmc.LOGDEBUG )

    # cosmetic: use subfolder for downloading instead of work folder
    temp_dir = os.path.join(tempgfx_folder, '').replace( "\\\\","\\" )
    if not exists( temp_dir ):
        _makedirs( temp_dir )
    destination = os.path.join(tempgfx_folder, file_name).replace( "\\\\","\\" ) # download to work folder first
    final_destination = os.path.join( path, file_name ).replace( "\\\\","\\" )
    try:
        # urlretrieve calls this after each block, which drives the
        # progress bar and polls for cancellation.
        def _report_hook( count, blocksize, totalsize ):
            try:
                percent = int( float( count * blocksize * 100 ) / totalsize )
                percent = max( 1, min( 100, percent ) )
            except Exception:
                # e.g. totalsize == 0
                percent = 1
            line1 = "%s%s" % ( __language__(32038) , get_unicode( album["artist"] ) )
            if type in ( "fanart", "clearlogo", "artistthumb", "musicbanner" ):
                dialog_msg( "update", percent = percent, line1 = line1, background = background )
            else:
                line2 = "%s%s" % ( __language__(32039) , get_unicode( album["title"] ) )
                dialog_msg( "update", percent = percent, line1 = line1, line2 = line2, background = background )
            if mode == "auto":
                if dialog_msg( "iscanceled", background = background ):
                    cancel_state["canceled"] = True
        if exists( path ):
            urllib.urlretrieve(url_cdart, destination, _report_hook)
            #message = ["Download Sucessful!"]
            message = [__language__(32023), __language__(32024), "File: %s" % get_unicode( path ), "Url: %s" % get_unicode( url_cdart )]
            file_copy( destination, final_destination ) # copy it to album folder
            # update database
            conn = None
            try:
                conn = sqlite3.connect(addon_db)
                c = conn.cursor()
                # The path is bound as a parameter: interpolating it into
                # the statement broke on paths containing quotes.
                if type == "cdart":
                    c.execute('''UPDATE alblist SET cdart="True" WHERE path=?''', ( get_unicode( album["path"] ), ) )
                elif type == "cover":
                    c.execute('''UPDATE alblist SET cover="True" WHERE path=?''', ( get_unicode( album["path"] ), ) )
                conn.commit()
                c.close()
            except Exception:
                log( "Error updating database", xbmc.LOGDEBUG )
                print_exc()
            finally:
                # the connection used to be left open
                if conn is not None:
                    conn.close()
            download_success = True
        else:
            log( "Path error", xbmc.LOGDEBUG )
            log( " file path: %s" % repr( destination ), xbmc.LOGDEBUG )
            message = [ __language__(32026), __language__(32025) , "File: %s" % get_unicode( path ), "Url: %s" % get_unicode( url_cdart ) ]
            #message = Download Problem, Check file paths - Artwork Not Downloaded]
        # always cleanup downloaded files
        delete_file( destination )
    except Exception:
        log( "General download error", xbmc.LOGDEBUG )
        message = [ __language__(32026), __language__(32025), "File: %s" % get_unicode( path ), "Url: %s" % get_unicode( url_cdart ) ]
        #message = [Download Problem, Check file paths - Artwork Not Downloaded]
        print_exc()
    is_canceled = cancel_state["canceled"]
    if mode == "auto" or mode == "single":
        return message, download_success, final_destination, is_canceled  # returns one of the messages built based on success or lack of
    else:
        dialog_msg( "close", background = background )
        return message, download_success, is_canceled
def _report_hook( count, blocksize, totalsize ):
    """Progress callback: update the download dialog and poll for cancel.

    count, blocksize and totalsize are used to compute a percentage that
    is clamped to 1..100 (1 when the computation fails).  `type`, `album`,
    `mode` and `background` are read from the enclosing scope.
    """
    try:
        percent = max( 1, min( 100, int( float( count * blocksize * 100 ) / totalsize ) ) )
    except:
        percent = 1
    artist_line = "%s%s" % ( __language__(32038) , get_unicode( album["artist"] ) )
    if type not in ( "fanart", "clearlogo", "artistthumb", "musicbanner" ):
        # album level artwork also shows the album title
        title_line = "%s%s" % ( __language__(32039) , get_unicode( album["title"] ) )
        dialog_msg( "update", percent = percent, line1 = artist_line, line2 = title_line, background = background )
    else:
        dialog_msg( "update", percent = percent, line1 = artist_line, background = background )
    if mode == "auto" and dialog_msg( "iscanceled", background = background ):
        # this binds a name local to the hook
        is_canceled = True
def auto_download( type, artist_list, background=False ):
    """Download artwork of one type for every artist in artist_list.

    type        - artwork type: "fanart", "clearlogo", "artistthumb", "musicbanner"
                  (or the same with an "_allartists" suffix, which is stripped),
                  or "cdart" / "cover" for album level artwork
    artist_list - list of artist dicts; the keys "has_art", "name", "local_id" and
                  "musicbrainz_artistid" are read here
    background  - passed through to every dialog_msg() and download_art() call

    Returns ( download_count, successfully_downloaded ) where
    successfully_downloaded is a list of dicts whose "path" is the downloaded
    file.  When any exception is raised it is printed, the dialog is closed and
    the function returns None implicitly.
    """
    is_canceled = False
    log( "Autodownload", xbmc.LOGDEBUG )
    try:
        artist_count = 0
        download_count = 0
        cdart_existing = 0  # NOTE(review): never read in this function
        album_count = 0
        d_error=False
        percent = 1
        successfully_downloaded = []
        # map the "_allartists" variants onto the plain artwork type
        if type in ( "clearlogo_allartists", "artistthumb_allartists", "fanart_allartists", "musicbanner_allartists" ):
            if type == "clearlogo_allartists":
                type = "clearlogo"
            elif type == "artistthumb_allartists":
                type = "artistthumb"
            elif type == "musicbanner_allartists":
                type = "musicbanner"
            else:
                type = "fanart"
        count_artist_local = len( artist_list )
        dialog_msg( "create", heading = __language__(32046), background = background )
        #Onscreen Dialog - Automatic Downloading of Artwork
        key_label = type
        for artist in artist_list:
            low_res = True
            # stop when the dialog reports a cancel or the last download_art() did
            if dialog_msg( "iscanceled", background = background ) or is_canceled:
                is_canceled = True
                break
            artist_count += 1
            if not artist["has_art"] == "True":
                # If fanart.tv does not report that it has an artist match skip it.
                continue
            percent = int( (artist_count / float(count_artist_local) ) * 100)
            if percent < 1:
                percent = 1
            if percent > 100:
                percent = 100
            log( "Artist: %-40s Local ID: %-10s Distant MBID: %s" % ( artist["name"], artist["local_id"], artist["musicbrainz_artistid"] ), xbmc.LOGNOTICE )
            if type in ( "fanart", "clearlogo", "artistthumb", "musicbanner" ) and artist[ "has_art" ]:
                # ---- artist level artwork ----
                dialog_msg( "update", percent = percent, line1 = "%s%s" % ( __language__(32038), get_unicode( artist["name"] ) ), background = background )
                # auto_art is used for the per-type download; temp_art only for the
                # single "fanart.jpg" download into the artist folder itself
                auto_art = {}
                temp_art = {}
                temp_art["musicbrainz_artistid"] = artist["musicbrainz_artistid"]
                auto_art["musicbrainz_artistid"] = artist["musicbrainz_artistid"]
                temp_art["artist"] = artist["name"]
                auto_art["artist"] = artist["name"]
                path = os.path.join( music_path, change_characters( smart_unicode( artist["name"] ) ) )
                if type == "fanart":
                    art = remote_fanart_list( auto_art )
                elif type == "clearlogo":
                    art = remote_clearlogo_list( auto_art )
                    arthd = remote_hdlogo_list( auto_art )
                elif type == "musicbanner":
                    art = remote_banner_list( auto_art )
                else:
                    art = remote_artistthumb_list( auto_art )
                if art:
                    if type == "fanart":
                        # fanart goes into an "extrafanart" sub folder of the artist folder
                        temp_art["path"] = path
                        auto_art["path"] = os.path.join( path, "extrafanart" ).replace( "\\\\" , "\\" )
                        if not exists( auto_art["path"] ):
                            try:
                                if _makedirs( auto_art["path"] ):
                                    log( "extrafanart directory made", xbmc.LOGDEBUG )
                            except:
                                print_exc()
                                log( "unable to make extrafanart directory", xbmc.LOGDEBUG )
                                continue
                        else:
                            log( "extrafanart directory already exists", xbmc.LOGDEBUG )
                    else:
                        auto_art["path"] = path
                    if type == "fanart":
                        if enable_fanart_limit:
                            # skip the artist when the folder already holds fanart_limit files
                            fanart_dir, fanart_files = listdir( auto_art["path"] )
                            fanart_number = len( fanart_files )
                            if fanart_number == fanart_limit:
                                continue
                        # first image also becomes fanart.jpg in the artist folder if missing
                        if not exists( os.path.join( path, "fanart.jpg" ).replace( "\\\\", "\\" ) ):
                            message, d_success, final_destination, is_canceled = download_art( art[0], temp_art, artist["local_id"], "fanart", "single", 0, background )
                        for artwork in art:
                            fanart = {}
                            # fanart_number is only bound when enable_fanart_limit is set;
                            # the `and` short-circuits otherwise
                            if enable_fanart_limit and fanart_number == fanart_limit:
                                log( "Fanart Limit Reached", xbmc.LOGNOTICE )
                                continue
                            if exists( os.path.join( auto_art["path"], os.path.basename( artwork ) ) ):
                                log( "Fanart already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork, auto_art, artist["local_id"], "fanart", "auto", 0, background )
                            if d_success == 1:
                                if enable_fanart_limit:
                                    fanart_number += 1
                                download_count += 1
                                fanart["artist"] = auto_art["artist"]
                                fanart["path"] = final_destination
                                successfully_downloaded.append( fanart )
                            else:
                                log( "Download Error... Check Path.", xbmc.LOGDEBUG )
                                log( " Path: %s" % auto_art["path"], xbmc.LOGDEBUG )
                                d_error = True
                    else:
                        # clearlogo / artistthumb / musicbanner: a single file per artist
                        if type == "clearlogo":
                            # prefer the HD logo when one exists and HD logos are enabled
                            if arthd and enable_hdlogos:
                                artwork = arthd[0]
                            else:
                                artwork = art[0]
                        else:
                            artwork = art[0]
                        if type == "artistthumb":
                            if resizeondownload:
                                low_res = check_size( auto_art["path"], key_label, 1000, 1000 )
                            # Fixed always redownloading Thumbs
                            else:
                                low_res = False
                            if exists( os.path.join( auto_art["path"], "folder.jpg" ) ) and not low_res:
                                log( "Artist Thumb already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork , auto_art, artist["local_id"], "artistthumb", "auto", 0, background )
                        elif type == "clearlogo":
                            if enable_hdlogos and resizeondownload and arthd:
                                low_res = check_size( auto_art["path"], key_label, 800, 310 )
                            else:
                                low_res = False
                            if exists( os.path.join( auto_art["path"], "logo.png" ) ) and not low_res:
                                log( "ClearLOGO already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork , auto_art, artist["local_id"], "clearlogo", "auto", 0, background )
                        elif type == "musicbanner":
                            if exists( os.path.join( auto_art["path"], "banner.jpg" ) ):
                                log( "Music Banner already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork , auto_art, artist["local_id"], "musicbanner", "auto", 0, background )
                        if d_success == 1:
                            download_count += 1
                            # the dict is reused as the result record: "path" now
                            # points at the downloaded file
                            auto_art["path"] = final_destination
                            successfully_downloaded.append( auto_art )
                        else:
                            log( "Download Error... Check Path.", xbmc.LOGDEBUG )
                            log( " Path: %s" % auto_art["path"], xbmc.LOGDEBUG )
                            d_error = True
                else :
                    log( "Artist Match not found", xbmc.LOGDEBUG )
            elif type in ( "cdart", "cover" ) and artist[ "has_art" ]:
                # ---- album level artwork ----
                local_album_list = get_local_albums_db( artist["name"], background )
                if type == "cdart":
                    remote_art_url = remote_cdart_list( artist )
                else:
                    remote_art_url = remote_coverart_list( artist )
                for album in local_album_list:
                    low_res = True
                    # leaves only the album loop; the artist loop checks for cancel
                    # again at the top of its next iteration
                    if dialog_msg( "iscanceled", background = background ):
                        break
                    if not remote_art_url:
                        log( "No artwork found", xbmc.LOGDEBUG )
                        break
                    album_count += 1
                    if not album["musicbrainz_albumid"]:
                        continue
                    dialog_msg( "update", percent = percent, line1 = "%s%s" % ( __language__(32038) , get_unicode( artist["name"] ) ), line2 = "%s%s" % (__language__(32039) , get_unicode( album["title"] ) ), background = background )
                    name = artist["name"]
                    title = album["title"]
                    log( "Album: %s" % album["title"], xbmc.LOGDEBUG )
                    # search when the album has no artwork of this type yet, or always
                    # when resizeondownload is set (low_res then decides)
                    if not album[key_label] or resizeondownload:
                        musicbrainz_albumid = album["musicbrainz_albumid"]
                        art = artwork_search( remote_art_url, musicbrainz_albumid, album["disc"], key_label )
                        if art:
                            if resizeondownload:
                                low_res = check_size( album["path"].replace( "\\\\", "\\" ), key_label, art["size"], art["size"] )
                            if art["picture"]:
                                log( "ALBUM MATCH ON FANART.TV FOUND", xbmc.LOGDEBUG )
                                #log( "test_album[0]: %s" % test_album[0], xbmc.LOGDEBUG )
                                if low_res:
                                    message, d_success, final_destination, is_canceled = download_art( art["picture"], album, album["local_id"], key_label, "auto", 0, background )
                                    if d_success == 1:
                                        download_count += 1
                                        album[key_label] = True
                                        # the album dict is reused as the result record
                                        album["path"] = final_destination
                                        successfully_downloaded.append( album )
                                    else:
                                        log( "Download Error... Check Path.", xbmc.LOGDEBUG )
                                        log( " Path: %s" % repr( album["path"] ), xbmc.LOGDEBUG )
                                        d_error = True
                                else:
                                    pass
                            else:
                                log( "ALBUM NOT MATCHED ON FANART.TV", xbmc.LOGDEBUG )
                        else:
                            log( "ALBUM NOT MATCHED ON FANART.TV", xbmc.LOGDEBUG )
                    else:
                        log( "%s artwork file already exists, skipping..." % key_label, xbmc.LOGDEBUG )
        dialog_msg( "close", background = background )
        # final summary dialog: first line depends on whether any download failed
        if d_error:
            dialog_msg( "ok", line1 = __language__(32026), line2 = "%s: %s" % ( __language__(32041), download_count ), background = background )
        else:
            dialog_msg( "ok", line1 = __language__(32040), line2 = "%s: %s" % ( __language__(32041), download_count ), background = background )
        return download_count, successfully_downloaded
    except:
        # any failure aborts the whole run; nothing is returned to the caller
        print_exc()
        dialog_msg( "close", background = background )
def set_meta_img(self, src_url):
    """Store the decoded url as `meta_img` and also make it the top image."""
    decoded = get_unicode(src_url)
    self.meta_img = decoded
    # the raw url is handed on; the callee is responsible for its own decoding
    self.set_top_img_no_check(src_url)
def set_article_html(self, article_html):
    """Store the decoded HTML of the article's `top_node` only."""
    decoded = get_unicode(article_html)
    self.article_html = decoded
def set_html(self, html):
    """Mark the article as downloaded and store its decoded HTML."""
    # the flag is raised first, before the html is decoded
    self.is_downloaded = True
    decoded = get_unicode(html, is_html=True)
    self.html = decoded
def set_text(self, text):
    """Store the body text, truncated to config.MAX_TEXT characters."""
    limit = self.config.MAX_TEXT
    self.text = get_unicode(text[:limit])
def __init__(self, url, title=u'', source_url=u'', config=None, **kwargs):
    """Create an article for `url`.

    Extra keyword arguments are treated as config values and merged into
    the config object.  When `source_url` is not given it is derived from
    the scheme and domain of `url`.  Raises ArticleException when no
    source url can be determined.
    """
    base_config = config or Configuration()
    self.config = base_config
    self.config = extend_config(self.config, kwargs)

    self.extractor = ContentExtractor(self.config)

    if source_url == u'':
        scheme = urls.get_scheme(url)
        source_url = scheme + '://' + urls.get_domain(url)

    if source_url is None or source_url == '':
        raise ArticleException('input url bad format')

    # ---- urls and title ----
    # main page of the news source that owns this article
    self.source_url = get_unicode(source_url)
    self.url = urls.prepare_url(get_unicode(url), self.source_url)
    self.title = get_unicode(title)

    # ---- images and videos ----
    # url of the "best image" representing the article (two names, one value)
    self.top_img = u''
    self.top_image = self.top_img
    # image provided by the metadata
    self.meta_img = u''
    # every image url in the article; both names share one list object
    self.imgs = []
    self.images = self.imgs
    # every video in the article: youtube, vimeo, etc
    self.movies = []

    # ---- text content ----
    # body text of the article
    self.text = u''
    # extracted from the body text via nlp()
    self.keywords = []
    # extracted from <meta> tags via parse()
    self.meta_keywords = []
    # also extracted from <meta> tags via parse()
    self.tags = set()
    # authors who published the article, via parse()
    self.authors = []
    self.publish_date = u''
    # generated from the body text
    self.summary = u''

    # ---- html ----
    # raw, unchanged HTML of the article
    self.html = u''
    # HTML of the article's main node (most important part)
    self.article_html = u''

    # ---- state flags ----
    # used to warn users who forget download() / parse() or call methods
    # out of order
    self.is_parsed = False
    self.is_downloaded = False

    # ---- meta fields of the HTML source ----
    self.meta_description = u""
    self.meta_lang = u""
    self.meta_favicon = u""
    # meta tags contain a lot of structured data, e.g. OpenGraph
    self.meta_data = {}
    # canonical link of the article if found in the meta data
    self.canonical_link = u""

    # ---- DOM objects ----
    # top element of the DOM chosen as candidate for the main body
    self.top_node = None
    # deep-copied clone of top_node taken before heavy parsing, so users
    # can query the "most important part of the page"
    self.clean_top_node = None
    # lxml DOM object generated from the HTML
    self.doc = None
    # deep-copied clone of doc taken before heavy cleaning, so users can
    # query the DOM
    self.clean_doc = None

    # property dict for users to store custom data
    self.additional_data = {}
def get_musicbrainz_album(album_title, artist, e_count, limit=1, with_singles=False, by_release=False, use_alias=False, use_live=False):
    """ Retrieves information for an album from MusicBrainz using the provided album title and artist name.

        Use:
            album, albums = get_musicbrainz_album( album_title, artist, e_count, limit, with_singles, by_release )

        album_title  - the album title (must be unicode)
        artist       - the artist's name (must be unicode)
        e_count      - used internally (should be set to 0)
        limit        - limit the number of responses
        with_singles - set to True to look up single releases at the same time
        by_release   - use release name for search
        use_alias    - build the query from release_group_url_alias instead of release_group_url_artist
        use_live     - append the live_nosingles filter instead of nolive_nosingles

        Returns a tuple ( album, albums ):
            album  - dict with the keys "score", "id", "title", "artist" and "artist_id"
                     (empty strings where nothing was found)
            albums - list of such dicts; only filled when limit != 1

        When limit == 1 and no id was found, the function calls itself again with a different
        flag combination (live -> alias -> release name -> release name + alias -> singles ->
        singles + alias) before giving up.
    """
    match_within = "~2"
    album = {}
    albums = []
    count = e_count  # NOTE(review): never read in this function
    album["score"] = ""
    album["id"] = ""
    album["title"] = ""
    album["artist"] = ""
    album["artist_id"] = ""
    album_temp = smart_unicode(album_title)  # NOTE(review): never read in this function
    artist = smart_unicode(get_unicode(artist))
    album_title = smart_unicode(get_unicode(album_title))
    log("Artist: %s" % artist, xbmc.LOGDEBUG)
    log("Album: %s" % album_title, xbmc.LOGDEBUG)
    # double quotes become '?' and '&' becomes 'and' before the names go into the query
    artist = artist.replace('"', '?')
    artist = artist.replace('&', 'and')
    album_title = album_title.replace('"', '?')
    album_title = album_title.replace('&', 'and')
    if limit == 1:
        # single-result lookup: build the query url for the current flag combination
        if not use_alias:
            url = release_group_url_artist % (
                server, quote_plus(album_title.encode("utf-8")), match_within, quote_plus(artist.encode("utf-8")))
            if not with_singles and not by_release and not use_live:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles or Live albums",
                    xbmc.LOGDEBUG)
                url = url + nolive_nosingles + query_limit % limit
            elif not with_singles and not by_release and use_live:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles",
                    xbmc.LOGDEBUG)
                url = url + live_nosingles + query_limit % limit
            elif not by_release:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Including Singles and Live albums",
                    xbmc.LOGDEBUG)
                url = url + query_limit % limit
            elif not with_singles:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Using Release Name",
                    xbmc.LOGDEBUG)
                url = release_group_url_artist % (
                    server, quote_plus(
                        album_title.encode("utf-8")), match_within,
                    quote_plus(artist.encode("utf-8"))) + query_limit % limit
            # NOTE(review): with by_release and with_singles both set, no branch above
            # matches and the url is used without the query_limit suffix
        elif use_alias:
            url = release_group_url_alias % (
                server, quote_plus(album_title.encode("utf-8")), match_within, quote_plus(artist.encode("utf-8")))
            if not with_singles and not by_release and not use_live:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles or Live albums",
                    xbmc.LOGDEBUG)
                url = url + nolive_nosingles + query_limit % limit
            elif not with_singles and not by_release and use_live:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles",
                    xbmc.LOGDEBUG)
                url = url + live_nosingles + query_limit % limit
            elif not by_release:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Including Singles and Live albums",
                    xbmc.LOGDEBUG)
                url = url + query_limit % limit
            elif not with_singles:
                log(
                    "Retrieving MusicBrainz Info - Checking by Artist - Using Release Name",
                    xbmc.LOGDEBUG)
                url = release_group_url_alias % (
                    server, quote_plus(
                        album_title.encode("utf-8")), match_within,
                    quote_plus(artist.encode("utf-8"))) + query_limit % limit
        htmlsource = get_html_source(url, "", save_file=False, overwrite=False)
        match = re.search(
            '''<release-group-list count="(?:.*?)" offset="(?:.*?)">(.*?)</release-group-list>''',
            htmlsource)
        if match:
            # the fields are taken from the first occurrence in the whole response
            try:
                mbid = re.search('''<release-group id="(.*?)"(?:.*?)">''', htmlsource)
                if not mbid:
                    mbid = re.search('''<release-group (?:.*?)id="(.*?)">''', htmlsource)
                mbtitle = re.search('''<title>(.*?)</title>''', htmlsource)
                mbartist = re.search('''<name>(.*?)</name>''', htmlsource)
                mbartistid = re.search('''<artist id="(.*?)">''', htmlsource)
                album["id"] = mbid.group(1)
                album["title"] = unescape(smart_unicode(mbtitle.group(1)))
                album["artist"] = unescape(smart_unicode(mbartist.group(1)))
                album["artist_id"] = mbartistid.group(1)
            except:
                # any failure (e.g. a pattern that did not match) leaves the remaining
                # fields at their empty defaults
                pass
        if not album["id"]:
            xbmc.sleep(mb_delay)  # sleep for allowing proper use of webserver
            # nothing found: retry with the next flag combination in the chain
            if not with_singles and not by_release and not use_alias and not use_live:
                log(
                    "No releases found on MusicBrainz, Checking For Live Album",
                    xbmc.LOGDEBUG)
                album, albums = get_musicbrainz_album(
                    album_title, artist, 0, limit, False, False, False, True)  # try again including live albums
            elif not with_singles and not by_release and not use_alias and use_live:
                log(
                    "No releases found on MusicBrainz, Checking by Artist Alias",
                    xbmc.LOGDEBUG)
                album, albums = get_musicbrainz_album(
                    album_title, artist, 0, limit, False, False, True, False)  # try again by using artist alias
            elif use_alias and not with_singles and not by_release and not use_live:
                log(
                    "No releases found on MusicBrainz, Checking by Release Name",
                    xbmc.LOGDEBUG)
                album, albums = get_musicbrainz_album(
                    album_title, artist, 0, limit, False, True, False, False)  # try again by using release name
            elif by_release and not with_singles and not use_alias:
                log(
                    "No releases found on MusicBrainz, Checking by Release name and Artist Alias",
                    xbmc.LOGDEBUG)
                album, albums = get_musicbrainz_album(
                    album_title, artist, 0, limit, False, True, True, False)  # try again by using release name and artist alias
            elif by_release and not with_singles and use_alias:
                log("No releases found on MusicBrainz, checking singles", xbmc.LOGDEBUG)
                album, albums = get_musicbrainz_album(
                    album_title, artist, 0, limit, True, False, False, False)  # try again with singles
            elif with_singles and not use_alias and not by_release:
                log(
                    "No releases found on MusicBrainz, checking singles and Artist Alias",
                    xbmc.LOGDEBUG)
                album, albums = get_musicbrainz_album(
                    album_title, artist, 0, limit, True, False, True, False)  # try again with singles and artist alias
            else:
                # end of the chain: only the artist name and id are looked up
                log("No releases found on MusicBrainz.", xbmc.LOGDEBUG)
                album["artist"], album["artist_id"], sort_name = get_musicbrainz_artist_id(artist)
    else:
        # multi-result lookup: one query, every release group becomes an entry in albums
        match_within = "~4"
        # NOTE(review): unlike the limit == 1 branch, the title and artist are not passed
        # through quote_plus here - confirm this is intended
        url = release_group_url_artist % (
            server, (album_title.encode("utf-8")), match_within, (artist.encode("utf-8"))) + query_limit % limit
        htmlsource = get_html_source(url, "", save_file=False, overwrite=False)
        match = re.search(
            '''<release-group-list count="(?:.*?)" offset="(?:.*?)">(.*?)</release-group-list>''',
            htmlsource)
        if match:
            match_release_group = re.findall(
                '''<release-group(.*?)</release-group>''', match.group(1))
            if match_release_group:
                for item in match_release_group:
                    # fresh dict per release group; the last one is also returned as album
                    album = {}
                    album["score"] = ""
                    album["id"] = ""
                    album["title"] = ""
                    album["artist"] = ""
                    album["artist_id"] = ""
                    try:
                        mbscore = re.search('''score="(.*?)"''', item)
                        mbid = re.search(
                            '''<release-group id="(.*?)"(?:.*?)">''', item)
                        if not mbid:
                            mbid = re.search('''id="(.*?)"(?:.*?)">''', item)
                            if not mbid:
                                mbid = re.search(
                                    '''<release-group (?:.*?)id="(.*?)">''', htmlsource)
                        mbtitle = re.search('''<title>(.*?)</title>''', item)
                        mbartist = re.search('''<name>(.*?)</name>''', item)
                        mbartistid = re.search('''<artist id="(.*?)">''', item)
                        album["score"] = mbscore.group(1)
                        album["id"] = mbid.group(1)
                        album["title"] = unescape(
                            smart_unicode(mbtitle.group(1)))
                        album["artist"] = unescape(
                            smart_unicode(mbartist.group(1)))
                        album["artist_id"] = mbartistid.group(1)
                        log("Score : %s" % album["score"], xbmc.LOGDEBUG)
                        log("Title : %s" % album["title"], xbmc.LOGDEBUG)
                        log("Id : %s" % album["id"], xbmc.LOGDEBUG)
                        log("Artist : %s" % album["artist"], xbmc.LOGDEBUG)
                        log("Artist ID : %s" % album["artist_id"], xbmc.LOGDEBUG)
                        albums.append(album)
                    except:
                        # an entry that fails to parse is reported and not appended
                        print_exc()
            else:
                pass
        else:
            pass
    xbmc.sleep(mb_delay)  # sleep for allowing proper use of webserver
    return album, albums