def get_book_image_files( url ):
    """ get page filenames for the title

    Parse the page at *url* and return the list of image hrefs.

    Two layouts are handled:
      * common case -- the image links live in the table that follows the
        second <hr> on the page;
      * multi-page case (detected via AttributeError when the table walk
        fails) -- the page carries index anchors labelled '1page', '2page',
        ... which are resolved relative to the current page and followed
        recursively.

    On any other error the traceback is recorded in the module-level
    ERRORS list and an empty list is returned (best effort, never raises).
    """
    try:
        soup = soupfy( url, encoding = 'shift_jis' )
        # common layout: image links are in the table after the second <hr>
        hrs = soup.findAll( 'hr' )
        table = hrs[ 1 ].next.next
        images = table.findAll( 'a' )
        return [ image[ 'href' ] for image in images ]
    except AttributeError:
        # multi-page layout: follow the '1page', '2page', ... anchors and
        # gather each sub-page's files recursively until no more pages exist
        number = 1
        files = []
        while True:
            page = soup.find( lambda tag : ( 'a' == tag.name and
                                             tag.string and
                                             '%dpage' % number == tag.string ) )
            if not page:
                break
            # rebuild an absolute URL for the sub-page, sitting in the same
            # directory as the current page
            scheme, netloc, path, _, _, _ = urlparse( url )
            path = '%s/%s' % ( os.path.dirname( path ), page[ 'href' ] )
            url = urlunparse( ( scheme, netloc, path, '', '', '' ) )
            files.extend( get_book_image_files( url ) )
            number += 1
        return files
    except Exception:
        # was a bare 'except:' -- narrowed so SystemExit/KeyboardInterrupt
        # are no longer swallowed; still best-effort for everything else
        stacktrace = traceback.format_exc()
        ERRORS.append( stacktrace )
        return []
def get_new_books( url = URL ):
    """ get { booktitle : url }

    Scrape the new-books listing at *url* and return a mapping of
    cleaned book title -> href.

    The listing repeats rows in groups of three; only every third row
    (i % 3 == 2) carries the title cells.  clean_title() raising
    TypeError signals that the anchor's first child was a Tag rather
    than a string, in which case its .string is used instead.
    """
    soup = soupfy( url, encoding = 'shift_jis' )
    rows = soup.findAll( 'tr', attrs = { 'align' : 'center' } )
    result = {}
    for i, row in enumerate( rows ):
        if 2 != i % 3:
            continue  # only every third row holds title cells
        for column in row.findAll( 'td' ):
            link = column.find( 'a' )
            if link is None:
                continue  # robustness: filler cells carry no anchor
            # local name instead of reassigning the 'url' parameter
            href = link[ 'href' ]
            title = link.contents[ 0 ]
            try:
                result[ clean_title( title ) ] = href
            except TypeError:
                # title was a Tag, not a string; unwrap and retry
                result[ clean_title( title.string ) ] = href
    return result