Example #1
import os
import traceback
from urllib.parse import urlparse, urlunparse

# soupfy() and the module-level ERRORS list are assumed to be defined
# elsewhere in the source project; a sketch of soupfy follows this example.

def get_book_image_files(url):
    """Return the page-image filenames (href values) for one title."""
    try:
        # Usual layout: the image links live in the table that follows
        # the second <hr> on the page.
        soup = soupfy(url, encoding='shift_jis')
        hrs = soup.findAll('hr')
        table = hrs[1].next.next
        images = table.findAll('a')
        return [image['href'] for image in images]
    except AttributeError:
        # Alternate layout: the title is split across numbered index pages
        # linked as "1page", "2page", ...; recurse into each of them.
        # This branch assumes the AttributeError came from probing the
        # layout above (so `soup` is bound), not from soupfy() itself.
        number = 1
        files = []
        while True:
            page = soup.find(lambda tag: (tag.name == 'a' and
                                          tag.string and
                                          tag.string == '%dpage' % number))
            if not page:
                break
            # Resolve the relative href against the current page's directory.
            scheme, netloc, path, _, _, _ = urlparse(url)
            path = '%s/%s' % (os.path.dirname(path), page['href'])
            url = urlunparse((scheme, netloc, path, '', '', ''))
            files.extend(get_book_image_files(url))
            number += 1
        return files
    except Exception:
        # Anything else (e.g. IndexError when the page has fewer than two
        # <hr> tags) is logged rather than raised.
        ERRORS.append(traceback.format_exc())
        return []
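
Both examples call a soupfy helper that the snippets never define. A minimal sketch of such a helper, assuming only urllib and BeautifulSoup (the name and the encoding keyword are taken from the calls above; the body itself is a guess, not the project's actual implementation):

import urllib.request
from bs4 import BeautifulSoup

def soupfy(url, encoding='utf-8'):
    # Hypothetical reconstruction: download the page and parse it with
    # the given character encoding.
    html = urllib.request.urlopen(url).read().decode(encoding, errors='replace')
    return BeautifulSoup(html, 'html.parser')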
Example #2
# URL (the default index page), soupfy() and clean_title() are assumed
# to be defined elsewhere in the source project.

def get_new_books(url=URL):
    """Return a {book_title: url} mapping scraped from the index page."""
    soup = soupfy(url, encoding='shift_jis')
    rows = soup.findAll('tr', attrs={'align': 'center'})
    result = {}
    for i, row in enumerate(rows):
        # The title links sit in every third centered row.
        if i % 3 == 2:
            for column in row.findAll('td'):
                link = column.find('a')
                url = link['href']
                title = link.contents[0]
                try:
                    result[clean_title(title)] = url
                except TypeError:
                    # The anchor wraps a tag (e.g. <b>...</b>) rather than
                    # bare text; unwrap it before cleaning.
                    title = title.string
                    result[clean_title(title)] = url
    return result
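
A sketch of how the two functions might be driven together; URL and ERRORS are the module-level names both snippets assume (in the real module they would precede the function definitions), and the index URL here is only a placeholder:

URL = 'http://example.com/books/index.html'  # placeholder, not the real site
ERRORS = []

if __name__ == '__main__':
    # Map each title to its page-image filenames.
    for title, book_url in get_new_books().items():
        pages = get_book_image_files(book_url)
        print('%s: %d page images' % (title, len(pages)))
    if ERRORS:
        print('%d scrape errors logged' % len(ERRORS))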