def _extract_feed(self):
    """Scrape every ``div.post`` element of ``self.soup`` into a story dict.

    Returns:
        A list of dicts with the keys ``uri``, ``title``, ``date``,
        ``author`` and ``contents``.  ``date`` is a ``datetime`` parsed with
        the ``'%B %d, %Y'`` format and ``author`` is the text of the byline
        link whose ``title`` starts with ``'Posts'``; either one is ``None``
        when it was not found.  A post that raises while being processed is
        printed and left out of the result.
    """
    skipped_tags = ('h1', 'small', 'hr')
    stories = []
    for post in self.soup.findAll('div', attrs={'class': 'post'}):
        try:
            heading = post.find('h1')
            byline = post.find('small')
            uri = heading.a['href']
            title = heading.a.text

            # The byline mixes the author link with loose text nodes, one of
            # which carries the date, so every child is probed for both.
            date = None
            author = None
            for node in byline:
                try:
                    if node.name == 'a' and node['title'].startswith('Posts'):
                        author = node.text
                        continue
                except (AttributeError, KeyError, TypeError):
                    # Text node, or a tag without a usable ``title``.
                    pass
                try:
                    text = node.replace('|', '').strip()
                    date = datetime.strptime(remove_ordinal(text), '%B %d, %Y')
                except Exception:
                    # Best effort: most children are simply not a date.
                    # Kept broad because remove_ordinal is defined elsewhere.
                    pass

            # Contents are included in the parent: the body is every child
            # of the post except the heading, the byline and rules.
            pieces = []
            for child in post:
                if getattr(child, 'name', None) in skipped_tags:
                    continue
                pieces.append(unicode(child).strip())

            stories.append({
                'uri': uri,
                'title': title,
                'date': date,
                'author': author,
                'contents': ''.join(pieces),
            })
        except Exception as e:
            # One malformed post must not abort the rest of the feed.
            print(e)
    return stories
def _extract_feed(self):
    """Scrape every ``div.storyblock`` element of ``self.soup``.

    Returns:
        A list of dicts with the keys ``uri``, ``title``, ``date`` and
        ``contents``.  ``uri`` is the heading link's ``href`` prefixed with
        ``'http://techdirt.com'`` and ``date`` is a ``datetime`` parsed with
        the ``"%a, %b %d %Y %I:%M%p"`` format.

    Raises:
        Any exception raised while processing a story block propagates to
        the caller unchanged; no partial result is returned.
    """
    stories = []
    for block in self.soup.findAll('div', attrs={'class': 'storyblock'}):
        heading = block.find('h1')
        stamp = block.find('p', attrs={'class': 'storydate'})
        body = block.find('div', attrs={'class': 'story'})

        title = heading.a.text
        link = 'http://techdirt.com' + heading.a['href']
        date = datetime.strptime(remove_ordinal(stamp.text),
                                 "%a, %b %d %Y %I:%M%p")

        pieces = []
        for node in body.contents:
            try:
                # Drop headings and any element carrying a non-empty
                # inline ``style`` attribute.
                if node.name in ('h1', 'h3'):
                    continue
                if node['style']:
                    continue
            except (AttributeError, KeyError, TypeError):
                # Text node, or a tag without ``style``: keep it.
                pass
            pieces.append(unicode(node).strip())

        stories.append({
            'uri': link,
            'title': title,
            'date': date,
            'contents': ''.join(pieces),
        })
    return stories
def _extract_feed(self):
    """Scrape every ``div`` whose id matches ``post-\\d+`` in ``self.soup``.

    Returns:
        A list of dicts with the keys ``uri``, ``title``, ``date`` and
        ``contents``.  ``date`` is built from two separate text fragments of
        the ``div.post-details`` element: a day (``"%A, %B %d, %Y"``) and a
        time of day (``"%I:%M %p"``).  It is the day alone when no time was
        found and ``None`` when no day was found.  ``contents`` is the
        concatenation of the post's ``<p>`` elements.  A post that raises
        while being processed is printed and left out of the result.
    """
    stories = []
    for post in self.soup.findAll('div', id=re.compile(r'post-\d+')):
        try:
            link_tag = post.find('a', attrs={'rel': re.compile('bookmark')})
            details = post.find('div', attrs={'class': 'post-details'})
            paragraphs = post.findAll('p')

            # Lack of semantics: day and time are loose text nodes, so each
            # string child is tried against both formats.
            day = None
            clock = None
            for node in details:
                try:
                    text = node.strip()
                except (AttributeError, TypeError):
                    # Not a text node (tags have no callable ``strip``).
                    continue
                text = remove_ordinal(text.replace('|', '').strip())
                try:
                    day = datetime.strptime(text, "%A, %B %d, %Y")
                except (ValueError, TypeError):
                    pass
                try:
                    clock = datetime.strptime(text, "%I:%M %p")
                except (ValueError, TypeError):
                    pass

            title = link_tag.text
            # Public item access instead of the internal ``attrMap`` cache.
            link = link_tag['href']

            date = day
            if day is not None and clock is not None:
                date = datetime.combine(day, clock.time())

            contents = ''.join(unicode(par).strip() for par in paragraphs)

            stories.append({
                'uri': link,
                'title': title,
                'date': date,
                'contents': contents,
            })
        except Exception as e:
            # One malformed post must not abort the rest of the feed.
            print(e)
    return stories