def get_archive_list(url):
    """Scrape the archive index page at *url* into a list of item dicts.

    The page groups link rows under month/year headings; every returned
    dict carries 'item_link_group_date' (the heading in effect for that
    row) plus whatever fields the matching per-row parser contributed.
    Rows whose parser produced no 'item_link_heading' are dropped.
    """
    # Fetch the page, clean it up, and hand it to the soup parser.
    raw = ''.join(urlopen(url))
    soup = BS(massage_html(raw))

    # Cell classes of interest:
    #   greybold   = month / year heading (once per group)
    #   blackbasic = an actual link row
    wanted = ['greybold','blackbasic']

    base_item = {'item_link_group_date':None}
    results = []
    for cell in soup.findAll('td', {'class':lambda a: a in wanted}):
        if cell.get('class') == 'greybold':
            # A date heading: remember it for every row that follows.
            base_item['item_link_group_date'] = _base_get_content(cell)
            continue

        # A link row; pick the parser from the row's visible text.
        entry = copy(base_item)
        text = ''.join([unicode(c).lower() for c in cell.contents])
        if 'in pictures:' in text:
            entry.update(_get_in_pictures(cell,url))
        elif 'feature:' in text:
            entry.update(_get_feature(cell,url))
        elif '[pictures]' in text:
            entry.update(_get_normal_with_extra_link(cell,url))
        else:
            entry.update(_get_normal(cell,url))

        # Parsers that found nothing usable leave no heading — skip.
        if not entry.get('item_link_heading'):
            continue
        if len(entry) != 1: # must have @ least some data
            results.append(entry)
    return results
def pull_item(url): # we are going to pull down the html for the item page # try and grab the relevant data + return back a dict if not url: return {} # grab the html try: lines = urlopen(url) except HTTPError, ex: print 'EXCEPTION:',url,str(ex) return {} html = massage_html(''.join(lines)) try: soup = BS(html) except Exception, ex: print 'EXCEPTION:',url,len(html) #raise return {} # classes: # blackbig = heading # blackit = subheading # only the first is the subheading # blackbasic = content # blackitalic = author classes = [('td','blackbig','item_heading'), ('td','blackit','item_subheading'), ('td','blackbasic','item_content'),