Example #1
0
def get_archive_list(url):
    """Scrape the archive listing page at ``url`` into a list of item dicts.

    The page is fetched with ``urlopen``, cleaned up by ``massage_html`` and
    parsed with ``BS``.  Every ``<td>`` whose class is ``greybold`` or
    ``blackbasic`` is then visited in the order ``findAll`` yields them:

    * ``greybold`` cells carry the month / year heading of a group; their
      content (via ``_base_get_content``) becomes the
      ``item_link_group_date`` of every item that follows, until the next
      heading replaces it.
    * any other matched cell (``blackbasic``) is a link line.  Its
      lower-cased content is searched for a marker that selects which
      ``_get_*`` helper extracts the data; the helper's result is merged
      into a copy of the current base item.

    Returns the list of item dicts, skipping cells for which the helper
    contributed no keys beyond the group date.
    """
    # grab our html; always close the response so the connection is not
    # leaked, even if reading the body fails
    response = urlopen(url)
    try:
        html = ''.join(response)
    finally:
        response.close()
    soup = BS(massage_html(html))

    # classes:
    #  greybold = month / year (once per group)
    #  blackbasic = link
    classes = ['greybold', 'blackbasic']
    base_item = {'item_link_group_date': None}
    items = []
    for el in soup.findAll('td', {'class': lambda a: a in classes}):
        if el.get('class') == 'greybold':
            # it's the date heading: remember it for the items that follow
            base_item['item_link_group_date'] = _base_get_content(el)
            continue

        # could be a pic line or could be a normal; start from a copy so
        # each item keeps the group date that was current when it was seen
        item = copy(base_item)
        all_content = ''.join([unicode(x).lower() for x in el.contents])
        if 'in pictures:' in all_content:
            # picture gallery line
            item.update(_get_in_pictures(el, url))
        elif 'feature:' in all_content:
            # feature line
            item.update(_get_feature(el, url))
        elif '[pictures]' in all_content:
            # normal line carrying an extra pictures link
            item.update(_get_normal_with_extra_link(el, url))
        else:
            # plain link line
            item.update(_get_normal(el, url))

        # must have @ least some data beyond the group date key
        if len(item) > len(base_item):
            items.append(item)

    return items
def pull_item(url):
    # we are going to pull down the html for the item page
    # try and grab the relevant data + return back a dict

    if not url:
        return {}

    # grab the html
    try:
        lines = urlopen(url)
    except HTTPError, ex:
        print 'EXCEPTION:',url,str(ex)
        return {}

    html = massage_html(''.join(lines))
    try:
        soup = BS(html)
    except Exception, ex:
        print 'EXCEPTION:',url,len(html)
        #raise
        return {}

    # classes:
    #  blackbig = heading
    #  blackit = subheading # only the first is the subheading
    #  blackbasic = content
    #  blackitalic = author
    classes = [('td','blackbig','item_heading'),
               ('td','blackit','item_subheading'),
               ('td','blackbasic','item_content'),