# -*- coding: utf-8 -*-
import re

import bs4
from bs4 import BeautifulSoup

import dwutil


def parse_content(url, title=None):
    """Extract (content, title) from an article page, e.g.
    http://news.qq.com/a/20121120/002046.htm"""
    page = dwutil.downloadPage(url)
    if not page:
        print "failed to download url '%s'" % (url,)
        return None
    try:
        page = page.decode('gb18030')
    except UnicodeDecodeError:
        print 'warn: failed to decode page: %s' % (url,)
        return None
    soup = BeautifulSoup(page, 'lxml')
    # remove script and style tags
    for elem in soup.find_all(['script', 'style']):
        elem.extract()
    if not title:
        ttag = soup.find('h1')
        if not ttag:
            return None
        title = ttag.text.strip()
    # the article body lives in a fixed container on qq.com article pages
    ctag = soup.find(id='Cnt-Main-Article-QQ')
    if ctag is None:
        print "failed to extract content from '%s'" % (url,)
        return None
    plist = []
    for p in ctag.find_all('p'):
        text = p.text.strip()
        if text:
            plist.append(text)
    if not plist:
        print 'can not find paragraph in content page: %s' % url
        return None
    content = '\n'.join(plist)
    return (content, title)
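The parsers here only rely on `dwutil.downloadPage(url)` returning the raw response bytes, or a falsy value on failure; `dwutil` itself is not shown in this section. A minimal stand-in honoring that contract might look like the sketch below (the urllib2 approach and the timeout value are assumptions, not the project's actual implementation):

def downloadPage(url):
    """Hypothetical stand-in for dwutil.downloadPage: fetch a URL and
    return the raw response bytes, or None on any network/HTTP error."""
    import urllib2
    try:
        return urllib2.urlopen(url, timeout=30).read()
    except Exception as e:
        print 'download failed for %s: %s' % (url, e)
        return None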
def parse_link(url, title=None):
    """Extract (content, title, pub_time) from an article page, e.g.
    http://news.qq.com/a/20121120/002046.htm"""
    page = dwutil.downloadPage(url)
    if not page:
        print "failed to download url '%s'" % (url,)
        return None
    page = page.decode('gb18030', 'ignore')
    soup = bs4.BeautifulSoup(page, 'lxml')
    # remove script and style tags
    for elem in soup.find_all(['script', 'style']):
        elem.extract()
    if not title:
        ttag = soup.find('h1')
        if not ttag:
            return None
        title = ttag.text.strip()
    # the publication time span appears under varying class names
    pub_time = soup.find('span', class_=re.compile('pubTime|article-time'))
    if not pub_time:
        print 'warn: %s has no pubtime' % url
        return None
    pub_time = pub_time.text
    ctag = soup.find(id='Cnt-Main-Article-QQ')
    if ctag is None:
        print "failed to extract content from '%s'" % (url,)
        return None
    plist = []
    for p in ctag.find_all('p'):
        text = p.text.strip()
        if text:
            plist.append(text)
    if not plist:
        print 'can not find paragraph in content page: %s' % url
        return None
    content = '\n'.join(plist)
    return (content, title, pub_time)
def get_urllist(curl):
    """Yield (link, title) for every article link on a channel page."""
    page = dwutil.downloadPage(curl)
    page = page.decode('gb18030', 'ignore')
    # article URLs look like http://<sub>.qq.com/<channel>/<date>/<id>.htm
    urlmatcher = re.compile(r'^http://(\w+\.)+qq\.com/\w+/\d+/\d+\.htm$')
    soup = bs4.BeautifulSoup(page, 'lxml')
    alist = soup.find_all('a', href=urlmatcher)
    if len(alist) == 0:
        print 'failed to find urllist from %s' % (curl,)
    else:
        for a in alist:
            link, title = a['href'], a.text.strip()
            yield (link, title)
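A quick usage sketch for this channel-page variant (the channel URL is illustrative only; note that the topic crawler further down defines another get_urllist with the same name, so the two variants belong in separate modules):

# crawl one channel page and parse each linked article
for link, title in get_urllist('http://news.qq.com/china_index.shtml'):
    result = parse_content(link, title)
    if result:
        content, title = result
        print title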
def get_topiclist(url):
    """Yield (link, tname, date) for every topic on a topic index page, e.g.
    http://news.qq.com/topic/gnzt.htm"""
    page = dwutil.downloadPage(url)
    page = page.decode('gb18030')
    soup = bs4.BeautifulSoup(page, 'lxml')
    alist = soup.find_all('a', class_='black linkcss fsize14')
    for a in alist:
        u = a['href']
        # drop the slash after 'zt' to get the topic landing page URL
        i = u.index('zt/')
        u = u[:i + 2] + u[i + 3:]
        p = dwutil.downloadPage(u)
        p = p.decode('gb18030')
        soup2 = bs4.BeautifulSoup(p, 'lxml')
        # the '最新消息' (latest news) anchor points at the topic's list page
        turl = soup2.find('a', text=u'最新消息')
        if turl is None:
            continue
        link = turl['href']
        # remove the '(' and ')' in (2013年02月01日)
        date = a.next_sibling.text[1:-1]
        tname = a.text
        yield (link, tname, date)
def get_urllist(topicUrl, topicname=None, maxnumofpage=0):
    """Yield (link, title, pubtime, topicname) from a topic list page, e.g.
    http://news.qq.com/l/13532840273/list_13532840273.htm
    Follows '下一页' (next page) links; maxnumofpage=0 means no page limit."""
    curl = topicUrl
    urlmatcher = re.compile(r'^http://news\.qq\.com/\w+/\d+/\d+\.htm$')
    nexturlmatcher = re.compile(u'^下一页')
    numofpage = 0
    while curl is not None:
        page = dwutil.downloadPage(curl)
        if page is None:
            break
        page = page.decode('gb18030')
        soup = BeautifulSoup(page, 'lxml')
        if not topicname:
            # the topic name is the middle '_'-separated field of the <title>
            topicname = soup.find('title').text.strip()
            si = topicname.find('_') + 1
            ei = topicname.find('_', si)
            topicname = topicname[si:ei]
        alist = soup.find_all('a', href=urlmatcher)
        if len(alist) == 0:
            print 'failed to find urllist from %s' % (curl,)
        else:
            for a in alist:
                link, title = a['href'], a.text.strip()
                pubtime = a.find_next_sibling('span')
                if pubtime:
                    pubtime = pubtime.text.strip()
                yield (link, title, pubtime, topicname)
        numofpage += 1
        if maxnumofpage > 0 and numofpage > maxnumofpage:
            break
        # follow the '下一页' (next page) link, if any
        anext = soup.find('a', text=nexturlmatcher)
        curl = anext['href'] if anext is not None else None
    print "navigated %d list pages" % numofpage
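Tying the pieces together: a minimal driver sketch, assuming the topic-list variant of get_urllist directly above (the entry URL comes from get_topiclist's docstring; the crawl depth maxnumofpage=2 is an arbitrary example value):

if __name__ == '__main__':
    # illustrative entry point: topics -> per-topic list pages -> articles
    for tlink, tname, tdate in get_topiclist('http://news.qq.com/topic/gnzt.htm'):
        for link, title, pubtime, topic in get_urllist(tlink, tname, maxnumofpage=2):
            result = parse_link(link, title)
            if result is None:
                continue
            content, title, pub_time = result
            print '%s | %s | %d chars' % (topic, title, len(content))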