def _get_normal_with_extra_link(el,url): content = _base_get_content(el) if not content: return {} return { 'item_link_heading': unicode(content), # in progress 'item_link_href': urljoin(url,el.contents[1].get('href') if hasattr(el.contents[1],'get') else unicode(el.contents[0].get('href'))), 'item_link_subheading': strip_tags(unicode(el.contents[3])).strip(), 'extra_link': unicode(el.contents[2]).strip() }
def _get_content_item_content(elements): # the content is spread out through many elements # different elements = line breaks # br = line breaks to_return = [] for el in elements: for content in el.contents: # we want the string, not the bs from soup content = unicode(content) # we don't want blank lines, or bullshit, bullshit = all caps? if content.upper() == content: continue if content.strip() == '': continue # don't need blank lines # don't need non-content bs, which means if we start w/ # a bad tag, rejected good_tags = ['p','b'] if content[0] == '<' and content[1] not in good_tags: continue if ''.join(content.strip().split()).lower() == '<br/>': # this is a line break to_return.append('') else: to_return.append(strip_tags(content).strip()) return "\n".join(to_return)
def _get_content_item_author(elements): if not elements: return '' return strip_tags(str(elements[0].contents[1])).strip()
# classes: # blackbig = heading # blackit = subheading # only the first is the subheading # blackbasic = content # blackitalic = author classes = [('td','blackbig','item_heading'), ('td','blackit','item_subheading'), ('td','blackbasic','item_content'), ('div','blackitalic','item_author')] # we are going to pull the item's info item = {'item_url':url} function_base = '_get_content_' for t,c,n in classes: elements = soup.findAll(t,{'class':c}) fn = globals().get(function_base+n) or globals().get(function_base) content = fn(elements) item[n] = content # there might be articles with multiple pages page = 1 if 'Page %s' in html: # follow to the next page and grab the content pass # if the item doesn't have an author, than the tag is probably malformed if not item.get('item_author'): item['item_author'] = strip_tags(re.findall(r'(>-.*</)',html)[0])[3:-2].strip() return item
def _get_feature(el,url): return { 'is_feature':1, 'item_link_href': urljoin(url,el.contents[3].get('href')), 'item_link_subheading': unicode(el.contents[4].strip()), 'item_link_heading': strip_tags(str(el.contents[3]).strip()), 'item_link_date': strip_tags(str(el.contents[4])).strip() }
def _get_normal(el,url): content = _base_get_content(el) if not content: return {} return { 'item_link_heading': content, 'item_link_href': urljoin(url,el.contents[1].get('href') if hasattr(el.contents[1],'get') else unicode(el.contents[0].get('href'))), 'item_link_subheading': strip_tags(unicode(el.contents[2])).strip() }