Example #1
0
def artist_musicbrainz_id( artist_id, artist_mbid ):
    """ Builds the name / MusicBrainz ID record for one library artist.

        artist_id   - passed to retrieve_artist_details() to load the artist
        artist_mbid - MusicBrainz artist ID already known to the caller (may be empty)

        Returns a dict with the keys "name" and "musicbrainz_artistid".
        The ID is taken from artist_mbid when given, otherwise from the stored
        "musicbrainzartistid" detail, and only looked up on MusicBrainz when
        neither is available.
    """
    artist_details = retrieve_artist_details( artist_id )
    # dict, not list: the fields below are stored under string keys
    artist = {}
    artist[ "name" ] = get_unicode( artist_details[ "label" ] )
    if artist_mbid:
        artist[ "musicbrainz_artistid" ] = artist_mbid
    elif artist_details["musicbrainzartistid"]:
        artist[ "musicbrainz_artistid" ] = artist_details["musicbrainzartistid"]
    else:
        # no ID known anywhere: search by name (returned name and sort name are not needed)
        _name, artist[ "musicbrainz_artistid" ], _sortname = get_musicbrainz_artist_id( artist[ "name" ] )
    return artist
Example #2
0
def artist_musicbrainz_id( artist_id, artist_mbid ):
    """ Builds the name / MusicBrainz ID record for one library artist.

        artist_id   - passed to retrieve_artist_details() to load the artist
        artist_mbid - MusicBrainz artist ID already known to the caller (may be empty)

        Returns a dict with the keys "name" and "musicbrainz_artistid".
        The ID is taken from artist_mbid when given, otherwise from the stored
        "musicbrainzartistid" detail, and only looked up on MusicBrainz when
        neither is available.
    """
    artist_details = retrieve_artist_details( artist_id )
    # dict, not list: the fields below are stored under string keys
    artist = {}
    artist[ "name" ] = get_unicode( artist_details[ "label" ] )
    if artist_mbid:
        artist[ "musicbrainz_artistid" ] = artist_mbid
    elif artist_details["musicbrainzartistid"]:
        artist[ "musicbrainz_artistid" ] = artist_details["musicbrainzartistid"]
    else:
        # no ID known anywhere: search by name (returned name and sort name are not needed)
        _name, artist[ "musicbrainz_artistid" ], _sortname = get_musicbrainz_artist_id( artist[ "name" ] )
    return artist
Example #3
0
    def get_records(self):
        """Walk the active worksheet row by row and yield every database
        record that the row parser builds, as soon as it is available."""

        sheet = self.wb.get_active_sheet()
        row_parser = AbbyParser(self.parsers, self.context,
                                self.record_builder)

        for sheet_row in sheet.iter_rows():
            # unicode value of every cell in this row
            values = [get_unicode(cell.internal_value) for cell in sheet_row]

            # rows without any content produce no records
            if self._empty(values):
                continue

            # drop the empty cells trailing at the end of the row
            values = self._remove_lasts_none(values)

            # hand out each record the parser could build from the row
            for built_record in row_parser.parse_row(values):
                yield built_record
Example #4
0
    def get_records(self):
        """Walk the active worksheet row by row and yield every database
        record that the row parser builds, as soon as it is available."""

        sheet = self.wb.get_active_sheet()
        row_parser = AbbyParser(self.parsers, self.context,
                                self.record_builder)

        for sheet_row in sheet.iter_rows():
            # unicode value of every cell in this row
            values = [get_unicode(cell.internal_value) for cell in sheet_row]

            # rows without any content produce no records
            if self._empty(values):
                continue

            # drop the empty cells trailing at the end of the row
            values = self._remove_lasts_none(values)

            # hand out each record the parser could build from the row
            for built_record in row_parser.parse_row(values):
                yield built_record
Example #5
0
 def set_imgs(self, imgs):
     """Store the image list under both attribute spellings,
     `article.images` and `article.imgs`, after converting every entry
     with get_unicode. Both attributes refer to the same list object.
     """
     converted = []
     for img in imgs:
         converted.append(get_unicode(img))
     self.images = self.imgs = converted
Example #6
0
 def set_top_img_no_check(self, src_url):
     """Store the top image URL under both attribute spellings,
     `top_img` and `top_image`, after converting it with get_unicode.
     """
     unicode_url = get_unicode(src_url)
     self.top_img = self.top_image = unicode_url
Example #7
0
 def set_title(self, title):
     """Store `title`, cut to config.MAX_TITLE characters and converted
     with get_unicode. An empty `title` never replaces a title that is
     already set.
     """
     # Skipped only when a title was already set by an educated guess
     # and the <title> extraction came back empty.
     if not self.title or title:
         shortened = title[:self.config.MAX_TITLE]
         self.title = get_unicode(shortened)
Example #8
0
 def set_authors(self, authors):
     """Store at most config.MAX_AUTHORS authors, each converted with
     get_unicode. `authors` must be a list such as
     ["firstName lastName", "firstName lastName"]; anything else raises
     Exception. An empty list leaves the current authors untouched.
     """
     if not isinstance(authors, list):
         raise Exception("authors input must be list!")
     if not authors:
         return
     converted = []
     for author in authors[:self.config.MAX_AUTHORS]:
         converted.append(get_unicode(author))
     self.authors = converted
Example #9
0
 def set_keywords(self, keywords):
     """Store at most config.MAX_KEYWORDS keywords, each converted with
     get_unicode. `keywords` must be a list; anything else raises
     Exception. An empty list leaves the current keywords untouched.
     """
     if not isinstance(keywords, list):
         raise Exception("Keyword input must be list!")
     if not keywords:
         return
     converted = []
     for keyword in keywords[:self.config.MAX_KEYWORDS]:
         converted.append(get_unicode(keyword))
     self.keywords = converted
Example #10
0
    def _normalize_product_units(self, product_units):
        """Return the long form u"kilogramos" when the stripped unit is
        u"Kg."; every other unit is returned exactly as it was passed."""
        is_short_kg = get_unicode(product_units.strip()) == u"Kg."
        return u"kilogramos" if is_short_kg else product_units
Example #11
0
 def fromstring(cls, html):
     """Parse `html` with lxml, keep the tree on `cls.doc` and return it.

     Returns None, after printing the traceback, when parsing raises.
     """
     markup = utils.get_unicode(html, is_html=True)
     # One broken article (out of potentially many in a `Source`) must
     # not bring the whole library down, so failures are reported and
     # swallowed here.
     try:
         has_declaration = markup.startswith('<?')
         if has_declaration:
             # lxml rejects unicode objects that still carry an encoding
             # tag (Issue #78), so the leading <?...?> is cut off
             markup = re.sub(r'^\<\?.*?\?\>', '', markup, flags=re.DOTALL)
         cls.doc = lxml.html.fromstring(markup)
     except Exception:
         traceback.print_exc()
         return None
     else:
         return cls.doc
Example #12
0
    def parse(self):
        """Join the two halves of a split head1 row and run Head1Parser
        on the result."""

        # rebuild the complete Head1 row: previous row, a space, this row
        merged = get_unicode(self.context.last_row + u" " + self.row[0])
        tbl_head1_row = [merged.strip()]

        if not Head1Parser(tbl_head1_row).accepts():
            # the merged row was rejected: report it
            print(tbl_head1_row)
            print("Ocurrio un error con un Head1Parser partido!")
        else:
            # the merged row is accepted: parse it with the shared context
            Head1Parser(tbl_head1_row, self.context).parse()

        # record which row type was processed last
        self.context.row_type = "tbl_head1_final_part"
Example #13
0
def fulltext(html, language='en'):
    """Extract the article text from an HTML string.

    The HTML is parsed by the configured parser, cleaned, reduced to its
    best content node and formatted; the formatted text is returned after
    conversion with get_unicode. `language` is stored on the Configuration
    used for every step.
    """
    from .cleaners import DocumentCleaner
    from .configuration import Configuration
    from .extractors import ContentExtractor
    from .outputformatters import OutputFormatter

    settings = Configuration()
    settings.language = language

    content_extractor = ContentExtractor(settings)
    cleaner = DocumentCleaner(settings)
    formatter = OutputFormatter(settings)

    # parse, then clean the document tree
    parsed = settings.get_parser().fromstring(html)
    cleaned = cleaner.clean(parsed)

    # pick the best content node and tidy it up
    best_node = content_extractor.post_cleanup(
        content_extractor.calculate_best_node(cleaned))

    # only the plain text half of the formatter output is returned
    plain_text, _article_html = formatter.get_formatted(best_node)
    return get_unicode(plain_text)
Example #14
0
 def set_summary(self, summary):
     """Store the summary (a paragraph of text from the title text and
     body text), cut to config.MAX_SUMMARY characters and converted with
     get_unicode.
     """
     max_length = self.config.MAX_SUMMARY
     self.summary = get_unicode(summary[:max_length])
def get_musicbrainz_album( album_title, artist, e_count, limit=1, with_singles=False, by_release=False, use_alias=False, use_live=False ):
    """ Retrieves information for Album from MusicBrainz using provided Album title and Artist name.

        Use:
            album, albums = get_musicbrainz_album( album_title, artist, e_count, limit, with_singles, by_release )

        album_title  - the album title(must be unicode)
        artist       - the artist's name(must be unicode)
        e_count      - used internally(should be set to 0)
        limit        - limit the number of responses; 1 selects the single-result lookup
                       with its fallback chain, any other value the multi-result listing
        with_singles - set to True to look up single releases at the same time
        by_release   - use release name for search
        use_alias    - build the query from release_group_url_alias instead of
                       release_group_url_artist
        use_live     - append the live_nosingles filter instead of nolive_nosingles

        Returns ( album, albums ):
            album  - dict with the keys "score", "id", "title", "artist" and "artist_id";
                     a value stays "" when it could not be parsed
            albums - list of such dicts; only the multi-result listing appends to it
    """
    # passed into the query URL template; presumably a fuzzy-match distance - TODO confirm
    match_within = "~2"
    album = {}
    albums = []
    # NOTE(review): count is not read again in this function
    count = e_count
    album["score"] = ""
    album["id"] = ""
    album["title"] = ""
    album["artist"] = ""
    album["artist_id"] = ""
    log( "Artist: %s" % smart_unicode( artist ), xbmc.LOGDEBUG )
    # NOTE(review): album_temp is not read again in this function
    album_temp = smart_unicode( album_title )
    artist = smart_unicode( get_unicode( artist ) )
    album_title = smart_unicode( get_unicode( album_title ) )
    log( "Artist: %s" % artist, xbmc.LOGDEBUG )
    log( "Album: %s" % album_title, xbmc.LOGDEBUG )
    # double quotes are swapped for "?" before the values are placed in the query URL
    artist = artist.replace('"','?')
    album_title = album_title.replace('"','?')
    if limit == 1:
        # single-result lookup: build the query URL from the flag combination
        if not use_alias:
            url = release_group_url_artist % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) )
            if not with_singles and not by_release and not use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles or Live albums", xbmc.LOGDEBUG )
                url = url + nolive_nosingles + query_limit % limit
            elif not with_singles and not by_release and use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles", xbmc.LOGDEBUG )
                url = url + live_nosingles + query_limit % limit
            elif not by_release:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Including Singles and Live albums", xbmc.LOGDEBUG )
                url = url + query_limit % limit
            elif not with_singles:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Using Release Name", xbmc.LOGDEBUG )
                # same template and arguments as the base url above, plus the limit
                url = release_group_url_artist % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) ) + query_limit % limit
            # NOTE(review): with by_release and with_singles both set, url stays the base
            # query without a limit suffix
        elif use_alias:
            # same flag handling as above, built on the alias template
            url = release_group_url_alias % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) )
            if not with_singles and not by_release and not use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles or Live albums", xbmc.LOGDEBUG )
                url = url + nolive_nosingles + query_limit % limit
            elif not with_singles and not by_release and use_live:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Not including Singles", xbmc.LOGDEBUG )
                url = url + live_nosingles + query_limit % limit
            elif not by_release:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Including Singles and Live albums", xbmc.LOGDEBUG )
                url = url + query_limit % limit
            elif not with_singles:
                log( "Retrieving MusicBrainz Info - Checking by Artist - Using Release Name", xbmc.LOGDEBUG )
                url = release_group_url_alias % ( server, quote_plus( album_title.encode("utf-8") ), match_within, quote_plus( artist.encode("utf-8") ) ) + query_limit % limit
        htmlsource = get_html_source( url, "", save_file = False, overwrite = False )
        # the list element only gates parsing; the fields are searched in the whole response
        match = re.search( '''<release-group-list count="(?:.*?)" offset="(?:.*?)">(.*?)</release-group-list>''', htmlsource )
        if match:
            try:
                mbid = re.search( '''<release-group id="(.*?)"(?:.*?)">''', htmlsource)
                if not mbid:
                    # second attempt for an id attribute that is not the first attribute
                    mbid = re.search( '''<release-group (?:.*?)id="(.*?)">''', htmlsource )
                mbtitle = re.search( '''<title>(.*?)</title>''', htmlsource)
                mbartist = re.search( '''<name>(.*?)</name>''', htmlsource)
                mbartistid = re.search( '''<artist id="(.*?)">''', htmlsource)
                album["id"] = mbid.group(1)
                album["title"] = unescape( smart_unicode( mbtitle.group(1) ) )
                album["artist"] = unescape( smart_unicode( mbartist.group(1) ) )
                album["artist_id"] = mbartistid.group(1)
            except:
                # NOTE(review): every error is swallowed here (e.g. .group() on a search that
                # returned None); fields assigned before the failure keep their values
                pass
        if not album["id"]:
            # nothing usable found: retry with the next flag combination; each retry
            # replaces album and albums with the result of the recursive call
            xbmc.sleep( mb_delay ) # sleep for allowing proper use of webserver
            if not with_singles and not by_release and not use_alias and not use_live:
                log( "No releases found on MusicBrainz, Checking For Live Album", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, False, False, True ) # try again with use_live set
            elif not with_singles and not by_release and not use_alias and use_live:
                log( "No releases found on MusicBrainz, Checking by Artist Alias", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, False, True, False ) # try again by using artist alias
            elif use_alias and not with_singles and not by_release and not use_live:
                log( "No releases found on MusicBrainz, Checking by Release Name", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, True, False, False ) # try again by using release name
            elif by_release and not with_singles and not use_alias:
                log( "No releases found on MusicBrainz, Checking by Release name and Artist Alias", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, False, True, True, False ) # try again by using release name and artist alias
            elif by_release and not with_singles and use_alias:
                log( "No releases found on MusicBrainz, checking singles", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, True, False, False, False ) # try again with singles
            elif with_singles and not use_alias and not by_release:
                log( "No releases found on MusicBrainz, checking singles and Artist Alias", xbmc.LOGDEBUG )
                album, albums = get_musicbrainz_album( album_title, artist, 0, limit, True, False, True, False ) # try again with singles and artist alias
            else:
                # end of the chain: only the artist name and id are looked up
                log( "No releases found on MusicBrainz.", xbmc.LOGDEBUG )
                album["artist"], album["artist_id"], sort_name = get_musicbrainz_artist_id( artist )
    else:
        # multi-result listing: one query, every release group collected in albums
        match_within = "~4"
        # NOTE(review): unlike the single-result queries, the terms are not passed
        # through quote_plus here - confirm this is intended
        url = release_group_url_artist % ( server, ( album_title.encode("utf-8") ), match_within, ( artist.encode("utf-8") ) ) + query_limit % limit
        htmlsource = get_html_source( url, "", save_file = False, overwrite = False )
        match = re.search( '''<release-group-list count="(?:.*?)" offset="(?:.*?)">(.*?)</release-group-list>''', htmlsource )
        if match:
            match_release_group = re.findall( '''<release-group(.*?)</release-group>''', match.group( 1 ) )
            if match_release_group:
                for item in match_release_group:
                    # a fresh dict per release group; it is appended only when every
                    # field below was parsed without an error
                    album = {}
                    album["score"] = ""
                    album["id"] = ""
                    album["title"] = ""
                    album["artist"] = ""
                    album["artist_id"] = ""
                    try:
                        mbscore = re.search( '''score="(.*?)"''', item)
                        mbid = re.search( '''<release-group id="(.*?)"(?:.*?)">''', item)
                        if not mbid:
                            mbid = re.search( '''id="(.*?)"(?:.*?)">''', item)
                            if not mbid:
                                # NOTE(review): this last attempt searches the whole response,
                                # not the current item
                                mbid = re.search( '''<release-group (?:.*?)id="(.*?)">''', htmlsource )
                        mbtitle = re.search( '''<title>(.*?)</title>''', item)
                        mbartist = re.search( '''<name>(.*?)</name>''', item)
                        mbartistid = re.search( '''<artist id="(.*?)">''', item)
                        album["score"] = mbscore.group(1)
                        album["id"] = mbid.group(1)
                        album["title"] = unescape( smart_unicode( mbtitle.group(1) ) )
                        album["artist"] = unescape( smart_unicode( mbartist.group(1) ) )
                        album["artist_id"] = mbartistid.group(1)
                        log( "Score     : %s" % album["score"], xbmc.LOGDEBUG )
                        log( "Title     : %s" % album["title"], xbmc.LOGDEBUG )
                        log( "Id        : %s" % album["id"], xbmc.LOGDEBUG )
                        log( "Artist    : %s" % album["artist"], xbmc.LOGDEBUG )
                        log( "Artist ID : %s" % album["artist_id"], xbmc.LOGDEBUG )
                        albums.append(album)
                    except:
                        # the traceback is printed and the loop moves on to the next item
                        print_exc()

            else:
                pass
        else:
            pass
    xbmc.sleep( mb_delay ) # sleep for allowing proper use of webserver
    # in the multi-result listing, album is the dict of the last release group handled
    return album, albums
Example #16
0
def download_art( url_cdart, album, database_id, type, mode, size, background = False ):
    """ Downloads one artwork file into the work folder and copies it to album["path"].

        url_cdart   - URL of the artwork to download
        album       - dict; "path" is the destination folder, "artist" (and "title" for
                      album artwork) are shown in the progress dialog
        database_id - passed to get_thumbnail_path() / get_fanart_path()
        type        - artwork type; "cdart" and "cover" also update the alblist table
        mode        - "auto", "single", or any other value (dialog is created and closed here)
        size        - not used by this function
        background  - forwarded to every dialog_msg() call

        Returns ( message, download_success, final_destination, is_canceled ) when mode is
        "auto" or "single", otherwise ( message, download_success, is_canceled ).
        Every exit path, including the unknown file type one, uses that shape.
    """
    log( "Downloading artwork... ", xbmc.LOGDEBUG )
    download_success = False
    percent = 1
    # Python 2 has no "nonlocal": the progress hook records a cancel request in this dict,
    # because rebinding a plain name inside the hook would never reach this scope
    cancel_state = { "requested": False }

    def _failure_message():
        # message = [Download Problem, Check file paths - Artwork Not Downloaded]
        return [ __language__(32026), __language__(32025), "File: %s" % get_unicode( path ), "Url: %s" % get_unicode( url_cdart ) ]

    def _result( message, success ):
        # builds the return tuple for the current mode; other modes also close the dialog
        if mode == "auto" or mode == "single":
            return message, success, final_destination, cancel_state["requested"]
        dialog_msg( "close", background = background )
        return message, success, cancel_state["requested"]

    if mode == "auto":
        dialog_msg( "update", percent = percent, background = background )
    else:
        dialog_msg( "create", heading = __language__(32047), background = background )
        #Onscreen Dialog - "Downloading...."
    file_name = get_filename( type, url_cdart, mode )
    album_path = album["path"].replace( "\\\\" , "\\" )
    #Helix: paths MUST end with trailing slash
    path = os.path.join( album_path, '' )
    final_destination = os.path.join( path, file_name ).replace( "\\\\","\\" )
    if file_name == "unknown":
        log( "Unknown Type ", xbmc.LOGDEBUG )
        return _result( _failure_message(), download_success )
    # NOTE(review): thumbnail_path is not read below; the calls are kept in case the
    # helpers have side effects - confirm and remove
    if type in ( "artistthumb", "cover" ):
        thumbnail_path = get_thumbnail_path( database_id, type )
    else:
        thumbnail_path = ""
    if type == "fanart" and mode in ( "manual", "single" ):
        thumbnail_path = get_fanart_path( database_id, type )
    if not exists( path ):
        try:
            _makedirs( album_path )
        except Exception:
            # best effort: the exists( path ) check below reports the failure to the caller
            log( "Unable to create path: %s" % repr( path ), xbmc.LOGDEBUG )
            print_exc()
    log( "Path: %s" % path, xbmc.LOGDEBUG )
    log( "Filename: %s" % file_name, xbmc.LOGDEBUG )
    log( "url: %s" % url_cdart, xbmc.LOGDEBUG )

    # cosmetic: use subfolder for downloading instead of work folder
    work_folder = os.path.join( tempgfx_folder, '' ).replace( "\\\\","\\" )
    if not exists( work_folder ):
        _makedirs( work_folder )
    destination = os.path.join( tempgfx_folder, file_name ).replace( "\\\\","\\" ) # download to work folder first
    try:
        #this give the ability to use the progress bar by retrieving the downloading information
        #and calculating the percentage
        def _report_hook( count, blocksize, totalsize ):
            try:
                percent = int( float( count * blocksize * 100 ) / totalsize )
            except Exception:
                # e.g. a total size of 0
                percent = 1
            percent = max( 1, min( 100, percent ) )
            line1 = "%s%s" % ( __language__(32038) , get_unicode( album["artist"] ) )
            if type in ( "fanart", "clearlogo", "artistthumb", "musicbanner" ):
                dialog_msg( "update", percent = percent, line1 = line1, background = background )
            else:
                dialog_msg( "update", percent = percent, line1 = line1, line2 = "%s%s" % ( __language__(32039) , get_unicode( album["title"] ) ), background = background )
            if mode == "auto" and dialog_msg( "iscanceled", background = background ):
                cancel_state["requested"] = True
        if exists( path ):
            urllib.urlretrieve( url_cdart, destination, _report_hook )
            #message = ["Download Sucessful!"]
            message = [__language__(32023), __language__(32024), "File: %s" % get_unicode( path ), "Url: %s" % get_unicode( url_cdart )]
            file_copy( destination, final_destination ) # copy it to album folder
            # update database
            conn = None
            try:
                conn = sqlite3.connect(addon_db)
                c = conn.cursor()
                # the path is bound as a parameter so quotes in it cannot break the statement
                if type == "cdart":
                    c.execute( '''UPDATE alblist SET cdart="True" WHERE path=?''', ( get_unicode( album["path"] ), ) )
                elif type == "cover":
                    c.execute( '''UPDATE alblist SET cover="True" WHERE path=?''', ( get_unicode( album["path"] ), ) )
                conn.commit()
                c.close()
            except Exception:
                log( "Error updating database", xbmc.LOGDEBUG )
                print_exc()
            finally:
                if conn is not None:
                    conn.close()
            download_success = True
        else:
            log( "Path error", xbmc.LOGDEBUG )
            log( "    file path: %s" % repr( destination ), xbmc.LOGDEBUG )
            message = _failure_message()
        # always cleanup downloaded files
        delete_file( destination )
    except Exception:
        log( "General download error", xbmc.LOGDEBUG )
        message = _failure_message()
        print_exc()
    # returns one of the messages built based on success or lack of
    return _result( message, download_success )
Example #17
0
 def _report_hook( count, blocksize, totalsize ):
     # Progress callback: turns the transfer counters into a percentage between 1 and 100
     # and pushes it to the progress dialog. type, album, mode and background are not
     # parameters; they are read from the surrounding scope.
     try:
         raw_percent = int( float( count * blocksize * 100 ) / totalsize )
     except:
         # any failure in the calculation falls back to the minimum
         percent = 1
     else:
         percent = max( 1, min( 100, raw_percent ) )
     artist_line = "%s%s" % ( __language__(32038) , get_unicode( album["artist"] ) )
     if type in ( "fanart", "clearlogo", "artistthumb", "musicbanner" ):
         # artist artwork: only the artist line is shown
         dialog_msg( "update", percent = percent, line1 = artist_line, background = background )
     else:
         album_line = "%s%s" % ( __language__(32039) , get_unicode( album["title"] ) )
         dialog_msg( "update", percent = percent, line1 = artist_line, line2 = album_line, background = background )
     if mode == "auto" and dialog_msg( "iscanceled", background = background ):
         # plain assignment: this binds a name local to the hook
         is_canceled = True
Example #18
0
def auto_download( type, artist_list, background=False ):
    is_canceled = False
    log( "Autodownload", xbmc.LOGDEBUG )
    try:
        artist_count = 0
        download_count = 0
        cdart_existing = 0
        album_count = 0
        d_error=False
        percent = 1
        successfully_downloaded = []
        if type in ( "clearlogo_allartists", "artistthumb_allartists", "fanart_allartists", "musicbanner_allartists" ):
            if type == "clearlogo_allartists":
                type = "clearlogo"
            elif type == "artistthumb_allartists":
                type = "artistthumb"
            elif type == "musicbanner_allartists":
                type = "musicbanner"
            else:
                type = "fanart"
        count_artist_local = len( artist_list )
        dialog_msg( "create", heading = __language__(32046), background = background )
        #Onscreen Dialog - Automatic Downloading of Artwork
        key_label = type
        for artist in artist_list:
            low_res = True
            if dialog_msg( "iscanceled", background = background ) or is_canceled:
                is_canceled = True
                break
            artist_count += 1
            if not artist["has_art"] == "True":
            # If fanart.tv does not report that it has an artist match skip it.
                continue
            percent = int( (artist_count / float(count_artist_local) ) * 100)
            if percent < 1:
                percent = 1
            if percent > 100:
                percent = 100
            log( "Artist: %-40s Local ID: %-10s   Distant MBID: %s" % ( artist["name"], artist["local_id"], artist["musicbrainz_artistid"] ), xbmc.LOGNOTICE )
            if type in ( "fanart", "clearlogo", "artistthumb", "musicbanner" ) and artist[ "has_art" ]:
                dialog_msg( "update", percent = percent, line1 = "%s%s" % ( __language__(32038), get_unicode( artist["name"] ) ), background = background )
                auto_art = {}
                temp_art = {}
                temp_art["musicbrainz_artistid"] = artist["musicbrainz_artistid"]
                auto_art["musicbrainz_artistid"] = artist["musicbrainz_artistid"]
                temp_art["artist"] = artist["name"]
                auto_art["artist"] = artist["name"]
                path = os.path.join( music_path, change_characters( smart_unicode( artist["name"] ) ) )
                if type == "fanart":
                    art = remote_fanart_list( auto_art )
                elif type == "clearlogo":
                    art = remote_clearlogo_list( auto_art )
                    arthd = remote_hdlogo_list( auto_art )
                elif type == "musicbanner":
                    art = remote_banner_list( auto_art )
                else:
                    art = remote_artistthumb_list( auto_art )
                if art:
                    if type == "fanart":
                        temp_art["path"] = path
                        auto_art["path"] = os.path.join( path, "extrafanart" ).replace( "\\\\" , "\\" )
                        if not exists( auto_art["path"] ):
                            try:
                                if _makedirs( auto_art["path"] ):
                                    log( "extrafanart directory made", xbmc.LOGDEBUG )
                            except:
                                print_exc()
                                log( "unable to make extrafanart directory", xbmc.LOGDEBUG )
                                continue
                        else:
                            log( "extrafanart directory already exists", xbmc.LOGDEBUG )
                    else:
                        auto_art["path"] = path
                    if type == "fanart":
                        if enable_fanart_limit:
                            fanart_dir, fanart_files = listdir( auto_art["path"] )
                            fanart_number = len( fanart_files )
                            if fanart_number == fanart_limit:
                                continue
                        if not exists( os.path.join( path, "fanart.jpg" ).replace( "\\\\", "\\" ) ):
                            message, d_success, final_destination, is_canceled = download_art( art[0], temp_art, artist["local_id"], "fanart", "single", 0, background )
                        for artwork in art:
                            fanart = {}
                            if enable_fanart_limit and fanart_number == fanart_limit:
                                log( "Fanart Limit Reached", xbmc.LOGNOTICE )
                                continue
                            if exists( os.path.join( auto_art["path"], os.path.basename( artwork ) ) ):
                                log( "Fanart already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork, auto_art, artist["local_id"], "fanart", "auto", 0, background )
                            if d_success == 1:
                                if enable_fanart_limit:
                                    fanart_number += 1
                                download_count += 1
                                fanart["artist"] = auto_art["artist"]
                                fanart["path"] = final_destination
                                successfully_downloaded.append( fanart )
                            else:
                                log( "Download Error...  Check Path.", xbmc.LOGDEBUG )
                                log( "    Path: %s" % auto_art["path"], xbmc.LOGDEBUG )
                                d_error = True
                    else:
                        if type == "clearlogo":
                            if arthd and enable_hdlogos:
                                artwork = arthd[0]
                            else:
                                artwork = art[0]
                        else:
                            artwork = art[0]
                        if type == "artistthumb":
                            if resizeondownload:
                                low_res = check_size( auto_art["path"], key_label, 1000, 1000 )
                            # Fixed always redownloading Thumbs
                            else:
                                low_res = False
                            if exists( os.path.join( auto_art["path"], "folder.jpg" ) ) and not low_res:
                                log( "Artist Thumb already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork , auto_art, artist["local_id"], "artistthumb", "auto", 0, background )
                        elif type == "clearlogo":
                            if enable_hdlogos and resizeondownload and arthd:
                                low_res = check_size( auto_art["path"], key_label, 800, 310 )
                            else:
                                low_res = False
                            if exists( os.path.join( auto_art["path"], "logo.png" ) ) and not low_res:
                                log( "ClearLOGO already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork , auto_art, artist["local_id"], "clearlogo", "auto", 0, background )
                        elif type == "musicbanner":
                            if exists( os.path.join( auto_art["path"], "banner.jpg" ) ):
                                log( "Music Banner already exists, skipping", xbmc.LOGDEBUG )
                                continue
                            else:
                                message, d_success, final_destination, is_canceled = download_art( artwork , auto_art, artist["local_id"], "musicbanner", "auto", 0, background )
                        if d_success == 1:
                            download_count += 1
                            auto_art["path"] = final_destination
                            successfully_downloaded.append( auto_art )
                        else:
                            log( "Download Error...  Check Path.", xbmc.LOGDEBUG )
                            log( "    Path: %s" % auto_art["path"], xbmc.LOGDEBUG )
                            d_error = True
                else :
                        log( "Artist Match not found", xbmc.LOGDEBUG )
            elif type in ( "cdart", "cover" ) and artist[ "has_art" ]:
                local_album_list = get_local_albums_db( artist["name"], background )
                if type == "cdart":
                    remote_art_url = remote_cdart_list( artist )
                else:
                    remote_art_url = remote_coverart_list( artist )
                for album in local_album_list:
                    low_res = True
                    if dialog_msg( "iscanceled", background = background ):
                            break
                    if not remote_art_url:
                        log( "No artwork found", xbmc.LOGDEBUG )
                        break
                    album_count += 1
                    if not album["musicbrainz_albumid"]:
                        continue
                    dialog_msg( "update", percent = percent, line1 = "%s%s" % ( __language__(32038) , get_unicode( artist["name"] ) ), line2 = "%s%s" % (__language__(32039) , get_unicode( album["title"] ) ), background = background )
                    name = artist["name"]
                    title = album["title"]
                    log( "Album: %s" % album["title"], xbmc.LOGDEBUG )
                    if not album[key_label] or resizeondownload:
                        musicbrainz_albumid = album["musicbrainz_albumid"]
                        art = artwork_search( remote_art_url, musicbrainz_albumid, album["disc"], key_label )
                        if art:
                            if resizeondownload:
                                low_res = check_size( album["path"].replace( "\\\\", "\\" ), key_label, art["size"], art["size"] )
                            if art["picture"]: 
                                log( "ALBUM MATCH ON FANART.TV FOUND", xbmc.LOGDEBUG )
                                #log( "test_album[0]: %s" % test_album[0], xbmc.LOGDEBUG )
                                if low_res:
                                    message, d_success, final_destination, is_canceled = download_art( art["picture"], album, album["local_id"], key_label, "auto", 0, background )
                                    if d_success == 1:
                                        download_count += 1
                                        album[key_label] = True
                                        album["path"] = final_destination
                                        successfully_downloaded.append( album )
                                    else:
                                        log( "Download Error...  Check Path.", xbmc.LOGDEBUG )
                                        log( "    Path: %s" % repr( album["path"] ), xbmc.LOGDEBUG )
                                        d_error = True
                                else:
                                    pass
                            else:
                                log( "ALBUM NOT MATCHED ON FANART.TV", xbmc.LOGDEBUG )
                        else:
                            log( "ALBUM NOT MATCHED ON FANART.TV", xbmc.LOGDEBUG )
                    else:
                        log( "%s artwork file already exists, skipping..." % key_label, xbmc.LOGDEBUG )
        dialog_msg( "close", background = background )
        if d_error:
            dialog_msg( "ok", line1 = __language__(32026), line2 = "%s: %s" % ( __language__(32041), download_count ), background = background )
        else:
            dialog_msg( "ok", line1 = __language__(32040), line2 = "%s: %s" % ( __language__(32041), download_count ), background = background  )
        return download_count, successfully_downloaded
    except:
        print_exc()
        dialog_msg( "close", background = background )
Example #19
0
 def set_meta_img(self, src_url):
     """Store the metadata image URL and forward it as the top image.

     The unicode-converted URL is kept on ``self.meta_img``; the URL
     exactly as received is then handed to ``set_top_img_no_check``.
     """
     meta_image_url = get_unicode(src_url)
     self.meta_img = meta_image_url
     self.set_top_img_no_check(src_url)
Example #20
0
 def set_article_html(self, article_html):
     """Store the HTML of the article's `top_node` only.

     The value is converted with ``get_unicode`` before it is assigned
     to ``self.article_html``.
     """
     top_node_html = get_unicode(article_html)
     self.article_html = top_node_html
Example #21
0
 def set_html(self, html):
     """Flag the article as downloaded and store its encoded HTML.

     The HTML goes through ``get_unicode(..., is_html=True)`` before it
     is assigned to ``self.html``.
     """
     # The flag is raised first, so it is already set if conversion fails.
     self.is_downloaded = True
     encoded_html = get_unicode(html, is_html=True)
     self.html = encoded_html
Example #22
0
 def set_text(self, text):
     """Store the body text, sliced to ``self.config.MAX_TEXT`` from the
     start and then converted with ``get_unicode``.
     """
     max_length = self.config.MAX_TEXT
     self.text = get_unicode(text[:max_length])
Example #23
0
    def __init__(self, url, title=u'', source_url=u'', config=None, **kwargs):
        """Set up an article for ``url`` with every parsed field empty.

        ``config`` falls back to a fresh ``Configuration`` when it is
        falsy; any extra keyword arguments are merged into it through
        ``extend_config``.  When ``source_url`` is left as the empty
        default it is rebuilt from the scheme and domain of ``url``.

        Raises ``ArticleException`` if the source URL is ``None`` or empty
        after that step.
        """
        self.config = extend_config(config or Configuration(), kwargs)
        self.extractor = ContentExtractor(self.config)

        # No explicit source given: fall back to "<scheme>://<domain>".
        if source_url == u'':
            source_url = urls.get_scheme(url) + '://' + urls.get_domain(url)

        if source_url is None or source_url == '':
            raise ArticleException('input url bad format')

        # Main page of the news source that owns this article.
        self.source_url = get_unicode(source_url)

        # The article URL is prepared relative to the source URL.
        url = get_unicode(url)
        self.url = urls.prepare_url(url, self.source_url)

        self.title = get_unicode(title)

        # --- Images and media -------------------------------------------
        # "Best image" for the article; both names start out empty.
        self.top_img = u''
        self.top_image = u''

        # Image supplied by the page metadata.
        self.meta_img = u''

        # Every image URL in the article; both names share ONE list.
        image_urls = []
        self.imgs = image_urls
        self.images = image_urls

        # Embedded videos (youtube, vimeo, etc).
        self.movies = []

        # --- Text and NLP results ---------------------------------------
        # Body text of the article.
        self.text = u''

        # Keywords extracted from the body text via nlp().
        self.keywords = []

        # Keywords extracted from <meta> tags via parse().
        self.meta_keywords = []

        # Tags, likewise extracted from <meta> tags via parse().
        self.tags = set()

        # Authors who published the article, filled in by parse().
        self.authors = []

        self.publish_date = u''

        # Summary generated from the body text.
        self.summary = u''

        # --- Raw markup -------------------------------------------------
        # Unchanged, raw HTML of the article.
        self.html = u''

        # HTML of the article's main node (the most important part).
        self.article_html = u''

        # --- State flags ------------------------------------------------
        # Used to warn users who forget download() or parse(), or who call
        # methods out of order.
        self.is_parsed = False
        self.is_downloaded = False

        # --- Metadata taken from the HTML source ------------------------
        # Meta description field.
        self.meta_description = u''

        # Meta language field.
        self.meta_lang = u''

        # Meta favicon field.
        self.meta_favicon = u''

        # Structured data carried by meta tags, e.g. OpenGraph.
        self.meta_data = {}

        # Canonical link of the article, if the meta data provides one.
        self.canonical_link = u''

        # --- DOM handles ------------------------------------------------
        # Top DOM element chosen as the candidate for the main body.
        self.top_node = None

        # Deep copy of ``top_node`` taken before heavy parsing, so users
        # can query the "most important part of the page".
        self.clean_top_node = None

        # lxml DOM object generated from the HTML.
        self.doc = None

        # Deep copy of ``doc`` taken before heavy cleaning; serves as an
        # API for users who need to query the DOM.
        self.clean_doc = None

        # Free-form dict where users can keep custom data.
        self.additional_data = {}
Example #24
0
def _blank_album():
    """Return an album record whose fields are all present but empty."""
    return {"score": "", "id": "", "title": "", "artist": "", "artist_id": ""}


def _next_search_strategy(with_singles, by_release, use_alias, use_live):
    """Pick the follow-up search after a single-result lookup found nothing.

    Returns ``(log message, flags)`` for the next attempt, where ``flags``
    is ``(with_singles, by_release, use_alias, use_live)``, or ``None``
    once no further strategy applies.
    """
    if not with_singles and not by_release and not use_alias and not use_live:
        # Plain search failed: allow live albums.
        return ("No releases found on MusicBrainz, Checking For Live Album",
                (False, False, False, True))
    if not with_singles and not by_release and not use_alias and use_live:
        # Live search failed: match on the artist alias instead.
        return ("No releases found on MusicBrainz, Checking by Artist Alias",
                (False, False, True, False))
    if use_alias and not with_singles and not by_release and not use_live:
        # Alias search failed: search by release name.
        return ("No releases found on MusicBrainz, Checking by Release Name",
                (False, True, False, False))
    if by_release and not with_singles and not use_alias:
        # Release-name search failed: release name plus artist alias.
        return ("No releases found on MusicBrainz, Checking by Release name and Artist Alias",
                (False, True, True, False))
    if by_release and not with_singles and use_alias:
        # Still nothing: include singles.
        return ("No releases found on MusicBrainz, checking singles",
                (True, False, False, False))
    if with_singles and not use_alias and not by_release:
        # Last attempt: singles plus artist alias.
        return ("No releases found on MusicBrainz, checking singles and Artist Alias",
                (True, False, True, False))
    return None


def get_musicbrainz_album(album_title,
                          artist,
                          e_count,
                          limit=1,
                          with_singles=False,
                          by_release=False,
                          use_alias=False,
                          use_live=False):
    """ Retrieves information for Album from MusicBrainz using provided Album title and Artist name.

        Use:
            album, albums = get_musicbrainz_album( album_title, artist, e_count, limit, with_singles, by_release )

        album_title  - the album title(must be unicode)
        artist       - the artist's name(must be unicode)
        e_count      - not used by this function; kept so existing callers
                       (which pass 0) keep working
        limit        - limit the number of responses; 1 selects the
                       single-result lookup with automatic retries, any other
                       value returns every parsed release group
        with_singles - set to True to look up single releases at the same time
        by_release   - use release name for search
        use_alias    - build the query from the artist-alias URL template
        use_live     - append the filter that allows live albums

        Returns ( album, albums ):
            album  - dict with "score", "id", "title", "artist" and
                     "artist_id"; fields stay "" when nothing was found.  In
                     multi-result mode this is the last release group parsed.
            albums - list of album dicts, filled only in multi-result mode.
    """
    list_pattern = '''<release-group-list count="(?:.*?)" offset="(?:.*?)">(.*?)</release-group-list>'''
    match_within = "~2"
    album = _blank_album()
    albums = []
    artist = smart_unicode(get_unicode(artist))
    album_title = smart_unicode(get_unicode(album_title))
    log("Artist: %s" % artist, xbmc.LOGDEBUG)
    log("Album: %s" % album_title, xbmc.LOGDEBUG)
    # Double quotes and ampersands are substituted before the query is built.
    artist = artist.replace('"', '?').replace('&', 'and')
    album_title = album_title.replace('"', '?').replace('&', 'and')
    if limit == 1:
        # The alias and non-alias searches differ only in the URL template.
        url_template = release_group_url_alias if use_alias else release_group_url_artist
        url = url_template % (
            server, quote_plus(album_title.encode("utf-8")), match_within,
            quote_plus(artist.encode("utf-8")))
        if not with_singles and not by_release and not use_live:
            log("Retrieving MusicBrainz Info - Checking by Artist - Not including Singles or Live albums",
                xbmc.LOGDEBUG)
            url = url + nolive_nosingles + query_limit % limit
        elif not with_singles and not by_release and use_live:
            log("Retrieving MusicBrainz Info - Checking by Artist - Not including Singles",
                xbmc.LOGDEBUG)
            url = url + live_nosingles + query_limit % limit
        elif not by_release:
            log("Retrieving MusicBrainz Info - Checking by Artist - Including Singles and Live albums",
                xbmc.LOGDEBUG)
            url = url + query_limit % limit
        elif not with_singles:
            # NOTE(review): this builds the same URL as the branch above even
            # though the message announces a release-name search - confirm
            # whether a dedicated release template was intended.
            log("Retrieving MusicBrainz Info - Checking by Artist - Using Release Name",
                xbmc.LOGDEBUG)
            url = url + query_limit % limit
        # with_singles and by_release together: the bare URL is used as is.
        htmlsource = get_html_source(url, "", save_file=False, overwrite=False)
        match = re.search(list_pattern, htmlsource)
        if match:
            try:
                mbid = re.search('''<release-group id="(.*?)"(?:.*?)">''',
                                 htmlsource)
                if not mbid:
                    mbid = re.search('''<release-group (?:.*?)id="(.*?)">''',
                                     htmlsource)
                mbtitle = re.search('''<title>(.*?)</title>''', htmlsource)
                mbartist = re.search('''<name>(.*?)</name>''', htmlsource)
                mbartistid = re.search('''<artist id="(.*?)">''', htmlsource)
                album["id"] = mbid.group(1)
                album["title"] = unescape(smart_unicode(mbtitle.group(1)))
                album["artist"] = unescape(smart_unicode(mbartist.group(1)))
                album["artist_id"] = mbartistid.group(1)
            except Exception:
                # Best effort: an element that is missing (re.search gave
                # None) leaves the remaining fields empty.  Unlike a bare
                # "except:", KeyboardInterrupt/SystemExit still propagate.
                log("Unable to parse MusicBrainz release-group response",
                    xbmc.LOGDEBUG)
        if not album["id"]:
            xbmc.sleep(mb_delay)  # sleep for allowing proper use of webserver
            fallback = _next_search_strategy(with_singles, by_release,
                                             use_alias, use_live)
            if fallback is not None:
                retry_message, retry_flags = fallback
                log(retry_message, xbmc.LOGDEBUG)
                album, albums = get_musicbrainz_album(
                    album_title, artist, 0, limit, *retry_flags)
            else:
                # Every strategy failed: at least resolve the artist.
                log("No releases found on MusicBrainz.", xbmc.LOGDEBUG)
                album["artist"], album[
                    "artist_id"], _sort_name = get_musicbrainz_artist_id(artist)
    else:
        match_within = "~4"
        # Title and artist are URL-quoted exactly like the single-result
        # lookup above, so reserved characters cannot corrupt the query.
        url = release_group_url_artist % (
            server, quote_plus(album_title.encode("utf-8")), match_within,
            quote_plus(artist.encode("utf-8"))) + query_limit % limit
        htmlsource = get_html_source(url, "", save_file=False, overwrite=False)
        match = re.search(list_pattern, htmlsource)
        if match:
            release_groups = re.findall(
                '''<release-group(.*?)</release-group>''', match.group(1))
            for item in release_groups:
                album = _blank_album()
                try:
                    mbscore = re.search('''score="(.*?)"''', item)
                    mbid = re.search(
                        '''<release-group id="(.*?)"(?:.*?)">''', item)
                    if not mbid:
                        # Only this item is searched for an id.  Falling back
                        # to the whole response would attach another release
                        # group's id to this record, so an item without an id
                        # is reported and skipped instead.
                        mbid = re.search('''id="(.*?)"(?:.*?)">''', item)
                    mbtitle = re.search('''<title>(.*?)</title>''', item)
                    mbartist = re.search('''<name>(.*?)</name>''', item)
                    mbartistid = re.search('''<artist id="(.*?)">''', item)
                    album["score"] = mbscore.group(1)
                    album["id"] = mbid.group(1)
                    album["title"] = unescape(
                        smart_unicode(mbtitle.group(1)))
                    album["artist"] = unescape(
                        smart_unicode(mbartist.group(1)))
                    album["artist_id"] = mbartistid.group(1)
                    log("Score     : %s" % album["score"], xbmc.LOGDEBUG)
                    log("Title     : %s" % album["title"], xbmc.LOGDEBUG)
                    log("Id        : %s" % album["id"], xbmc.LOGDEBUG)
                    log("Artist    : %s" % album["artist"], xbmc.LOGDEBUG)
                    log("Artist ID : %s" % album["artist_id"],
                        xbmc.LOGDEBUG)
                    albums.append(album)
                except Exception:
                    # A malformed item is reported and left out of the list.
                    print_exc()
    xbmc.sleep(mb_delay)  # sleep for allowing proper use of webserver
    return album, albums