# Scratch script: parse a saved HTML page with BeautifulSoup and print one
# tuple of link data for every attribute-less <p> directly under the page's
# <blockquote>.
#
# NOTE(review): written against the `BeautifulSoup` module (the import name
# used by BeautifulSoup 3). Every print below has a single parenthesised
# argument, so it prints the same thing whether `print` is the Python 2
# statement or the Python 3 function.
import sys

from BeautifulSoup import BeautifulSoup


def q(e):
    """Return ``e.text`` when ``e`` is truthy; otherwise return ``e`` itself.

    Used below on the result of ``p.find('font')`` so that a falsy result
    (presumably ``None`` when the paragraph has no <font> tag) is passed
    through instead of raising ``AttributeError`` on ``.text``.
    """
    return e.text if e else e


# Hard-coded location of the saved page to parse.
summary_path = '/Users/ilya/Desktop/cl_summary.txt'

# Parse and extract while the file is open; the handle is closed on exit
# from the `with` block, including when an exception is raised.
with open(summary_path, 'r') as page:
    soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # print soup

    # One tuple per matching <p>:
    #   (first link text, first link href, <font> text or the falsy result,
    #    <small> link text, <small> link href)
    # Calling the tag with a predicate and recursive=False restricts the
    # search to direct children of `soup.blockquote`.
    lst = [
        (p.a.text,
         p.a['href'],
         q(p.find('font')),
         p.small.a.text,
         p.small.a['href'])
        for p in soup.blockquote(
            lambda tag: tag.name == 'p' and not tag.attrs,
            recursive=False)
    ]

print(lst)
# Debugging stop: the script always ends here with exit status 1.
sys.exit(1)

# NOTE(review): everything below is unreachable because of the exit above.
# It also relies on `pdtl` and `siteMap`, which are not defined anywhere in
# the visible code -- looks like leftover experiments; confirm before reuse.
span = pdtl[0].span.text
a = pdtl[0].find('a')
# print span
print(a.contents[0].strip())
print(a.contents[1].text)
# print dir(a)
sys.exit(1)

print([e.text for e in siteMap.findAll('h3') if e.findNext().name == 'a'])
print([(e.name, e['class'])
       for e in siteMap.findAll('div', 'content-group')[0]
       .findAll(recursive=False)])
# , attrs={'class':'sublist'})
print([e.text
       for e in siteMap.findAll('div', 'content-group')[0]
       .findAll(recursive=False)[0]
       if e.findNext().name == 'a'])