def _get_normal_with_extra_link(el, url):
    """Build the item dict for a normal entry that also carries an extra
    trailing link (the "[pictures]" variant) after its subheading.

    Returns an empty dict when the cell yields no usable content.
    NOTE(review): the contents[1]/[2]/[3] index layout is assumed from the
    page structure — confirm against a live archive page.
    """
    content = _base_get_content(el)
    if not content:
        return {}
    # Take the href from the second child when it is a tag; otherwise
    # fall back to the href attribute on the first child.
    second = el.contents[1]
    if hasattr(second, 'get'):
        raw_href = second.get('href')
    else:
        raw_href = unicode(el.contents[0].get('href'))
    return {
        'item_link_heading': unicode(content),
        'item_link_href': urljoin(url, raw_href),
        'item_link_subheading': strip_tags(unicode(el.contents[3])).strip(),
        'extra_link': unicode(el.contents[2]).strip(),
    }
def get_archive_list(url):
    """Fetch *url* and scrape its archive table into a list of item dicts.

    The page marks its <td> cells with CSS classes:
      greybold   -- month/year group header (once per group)
      blackbasic -- an actual archive link

    Each returned dict carries 'item_link_group_date' (the most recent
    greybold header seen) plus whatever keys the matching _get_* helper
    produced for that cell.  Cells that yield nothing beyond the date
    are dropped.
    """
    # Fetch the page; close the connection even if reading fails
    # (the original leaked the urlopen handle).
    response = urlopen(url)
    try:
        html = ''.join(response)
    finally:
        response.close()
    soup = BS(massage_html(html))

    classes = ['greybold', 'blackbasic']
    base_item = {'item_link_group_date': None}
    items = []
    for el in soup.findAll('td', {'class': lambda a: a in classes}):
        if el.get('class') == 'greybold':
            # Group date header: remember it for every following item.
            base_item['item_link_group_date'] = _base_get_content(el)
            continue
        # Could be a pictures line, a feature, or a normal entry.
        item = copy(base_item)
        all_content = ''.join(unicode(x).lower() for x in el.contents)
        # Dispatch on marker text embedded in the cell.
        if 'in pictures:' in all_content:
            item.update(_get_in_pictures(el, url))
        elif 'feature:' in all_content:
            item.update(_get_feature(el, url))
        elif '[pictures]' in all_content:
            item.update(_get_normal_with_extra_link(el, url))
        else:
            item.update(_get_normal(el, url))
        # if not item.get('item_link_heading'): continue
        if len(item) != 1:  # must have at least some data beyond the date
            items.append(item)
    return items
def _get_normal(el, url):
    """Build the item dict for a plain archive entry: heading, absolute
    link, and subheading.

    Returns an empty dict when the cell yields no usable content.
    NOTE(review): the contents[0]/[1]/[2] index layout is assumed from the
    page structure — confirm against a live archive page.
    """
    content = _base_get_content(el)
    if not content:
        return {}
    second = el.contents[1]
    raw_href = (second.get('href') if hasattr(second, 'get')
                else unicode(el.contents[0].get('href')))
    return {
        'item_link_heading': content,
        'item_link_href': urljoin(url, raw_href),
        'item_link_subheading': strip_tags(unicode(el.contents[2])).strip(),
    }