# -*- coding: utf-8 -*-
import re

import bs4
from bs4 import BeautifulSoup

import dwutil


def parse_content(url, title=None):
    """Extract (content, title) from an article page, e.g.
    http://news.qq.com/a/20121120/002046.htm"""
    page = dwutil.downloadPage(url)
    if not page:
        print "failed to download url '%s'" % (url,)
        return None
    try:
        page = page.decode('gb18030')
    except UnicodeDecodeError:
        print 'warn: failed to decode page: %s' % (url,)
        return None
    soup = BeautifulSoup(page, 'lxml')
    # remove script and style tags
    for elem in soup.find_all(['script', 'style']):
        elem.extract()
    if not title:
        ttag = soup.find('h1')
        if not ttag:
            return None
        title = ttag.text.strip()
    # the article body lives in a fixed container on qq.com article pages
    ctag = soup.find(id='Cnt-Main-Article-QQ')
    if ctag is None:
        print "failed to extract content from '%s'" % (url,)
        return None
    plist = []
    for p in ctag.find_all('p'):
        text = p.text.strip()
        if text:
            plist.append(text)
    if not plist:
        print 'can not find paragraph in content page: %s' % url
        return None
    content = '\n'.join(plist)
    return (content, title)
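The parsers here only rely on `dwutil.downloadPage(url)` returning the raw response bytes, or a falsy value on failure; `dwutil` itself is not shown in this section. A minimal stand-in honoring that contract might look like the sketch below (the urllib2 approach and the timeout value are assumptions, not the project's actual implementation):

def downloadPage(url):
    """Hypothetical stand-in for dwutil.downloadPage: fetch a URL and
    return the raw response bytes, or None on any network/HTTP error."""
    import urllib2
    try:
        return urllib2.urlopen(url, timeout=30).read()
    except Exception as e:
        print 'download failed for %s: %s' % (url, e)
        return None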
def parse_link(url, title=None):
    """Extract (content, title, pub_time) from an article page, e.g.
    http://news.qq.com/a/20121120/002046.htm"""
    page = dwutil.downloadPage(url)
    if not page:
        print "failed to download url '%s'" % (url,)
        return None
    page = page.decode('gb18030', 'ignore')
    soup = bs4.BeautifulSoup(page, 'lxml')
    # remove script and style tags
    for elem in soup.find_all(['script', 'style']):
        elem.extract()
    if not title:
        ttag = soup.find('h1')
        if not ttag:
            return None
        title = ttag.text.strip()
    # the publication time span appears under varying class names
    pub_time = soup.find('span', class_=re.compile('pubTime|article-time'))
    if not pub_time:
        print 'warn: %s has no pubtime' % url
        return None
    pub_time = pub_time.text
    ctag = soup.find(id='Cnt-Main-Article-QQ')
    if ctag is None:
        print "failed to extract content from '%s'" % (url,)
        return None
    plist = []
    for p in ctag.find_all('p'):
        text = p.text.strip()
        if text:
            plist.append(text)
    if not plist:
        print 'can not find paragraph in content page: %s' % url
        return None
    content = '\n'.join(plist)
    return (content, title, pub_time)
def get_urllist(curl):
    """Yield (link, title) for every article link on a channel page."""
    page = dwutil.downloadPage(curl)
    page = page.decode('gb18030', 'ignore')
    # article URLs look like http://<sub>.qq.com/<channel>/<date>/<id>.htm
    urlmatcher = re.compile(r'^http://(\w+\.)+qq\.com/\w+/\d+/\d+\.htm$')
    soup = bs4.BeautifulSoup(page, 'lxml')
    alist = soup.find_all('a', href=urlmatcher)
    if len(alist) == 0:
        print 'failed to find urllist from %s' % (curl,)
    else:
        for a in alist:
            link, title = a['href'], a.text.strip()
            yield (link, title)
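A quick usage sketch for this channel-page variant (the channel URL is illustrative only; note that the topic crawler further down defines another get_urllist with the same name, so the two variants belong in separate modules):

# crawl one channel page and parse each linked article
for link, title in get_urllist('http://news.qq.com/china_index.shtml'):
    result = parse_content(link, title)
    if result:
        content, title = result
        print title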
def get_topiclist(url):
    """Yield (link, tname, date) for every topic on a topic index page, e.g.
    http://news.qq.com/topic/gnzt.htm"""
    page = dwutil.downloadPage(url)
    page = page.decode('gb18030')
    soup = bs4.BeautifulSoup(page, 'lxml')
    alist = soup.find_all('a', class_='black linkcss fsize14')
    for a in alist:
        u = a['href']
        # drop the slash after 'zt' to get the topic landing page URL
        i = u.index('zt/')
        u = u[:i + 2] + u[i + 3:]
        p = dwutil.downloadPage(u)
        p = p.decode('gb18030')
        soup2 = bs4.BeautifulSoup(p, 'lxml')
        # the '最新消息' (latest news) anchor points at the topic's list page
        turl = soup2.find('a', text=u'最新消息')
        if turl is None:
            continue
        link = turl['href']
        # remove the '(' and ')' in (2013年02月01日)
        date = a.next_sibling.text[1:-1]
        tname = a.text
        yield (link, tname, date)
def get_urllist(topicUrl, topicname=None, maxnumofpage=0):
    """Yield (link, title, pubtime, topicname) from a topic list page, e.g.
    http://news.qq.com/l/13532840273/list_13532840273.htm
    Follows '下一页' (next page) links; maxnumofpage=0 means no page limit."""
    curl = topicUrl
    urlmatcher = re.compile(r'^http://news\.qq\.com/\w+/\d+/\d+\.htm$')
    nexturlmatcher = re.compile(u'^下一页')
    numofpage = 0
    while curl is not None:
        page = dwutil.downloadPage(curl)
        if page is None:
            break
        page = page.decode('gb18030')
        soup = BeautifulSoup(page, 'lxml')
        if not topicname:
            # the topic name is the middle '_'-separated field of the <title>
            topicname = soup.find('title').text.strip()
            si = topicname.find('_') + 1
            ei = topicname.find('_', si)
            topicname = topicname[si:ei]
        alist = soup.find_all('a', href=urlmatcher)
        if len(alist) == 0:
            print 'failed to find urllist from %s' % (curl,)
        else:
            for a in alist:
                link, title = a['href'], a.text.strip()
                pubtime = a.find_next_sibling('span')
                if pubtime:
                    pubtime = pubtime.text.strip()
                yield (link, title, pubtime, topicname)
        numofpage += 1
        if maxnumofpage > 0 and numofpage > maxnumofpage:
            break
        # follow the '下一页' (next page) link, if any
        anext = soup.find('a', text=nexturlmatcher)
        curl = anext['href'] if anext is not None else None
    print "navigated %d list pages" % numofpage
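Tying the pieces together: a minimal driver sketch, assuming the topic-list variant of get_urllist directly above (the entry URL comes from get_topiclist's docstring; the crawl depth maxnumofpage=2 is an arbitrary example value):

if __name__ == '__main__':
    # illustrative entry point: topics -> per-topic list pages -> articles
    for tlink, tname, tdate in get_topiclist('http://news.qq.com/topic/gnzt.htm'):
        for link, title, pubtime, topic in get_urllist(tlink, tname, maxnumofpage=2):
            result = parse_link(link, title)
            if result is None:
                continue
            content, title, pub_time = result
            print '%s | %s | %d chars' % (topic, title, len(content))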