Code example #1
# Assumes the project-level getUrlSource() helper, the CGetResourcesHref
# class, BeautifulSoup and the standard time module are available.
def main():
    url = 'http://wenku.baidu.com/portal/subject/8_s0_g0_v0'

    mCGetResourcesHref = CGetResourcesHref()
    soup = mCGetResourcesHref.getSoup(url)

    Hrefs = set()  #set([href, ...])
    #Get the subject hrefs.
    subjectHrefs = mCGetResourcesHref.getSubject(soup)
    Hrefs |= subjectHrefs
    url = 'http://wenku.baidu.com'
    for s_href, svg in subjectHrefs:
        soup = mCGetResourcesHref.getSoup(url + s_href)
        versionHrefs = mCGetResourcesHref.getVersion(soup)
        Hrefs |= versionHrefs
        for v_href, svg in versionHrefs:
            soup = mCGetResourcesHref.getSoup(url + v_href)
            gradeHrefs = mCGetResourcesHref.getGrade(soup)
            Hrefs |= gradeHrefs

    svgHrefs = mCGetResourcesHref.getSVG(soup)
    lessonHrefs = mCGetResourcesHref.getLessons(soup)
    url = 'http://wenku.baidu.com'
    for svg_href in svgHrefs:
        temp = url + svg_href
        time.sleep(3)
        mSourceCode = getUrlSource(temp)
        soup = BeautifulSoup(mSourceCode)
        svg = mCGetResourcesHref.getSelectSVG(soup)
        lessonHrefs = mCGetResourcesHref.getLessons(soup)
        for lesson_href, lesson_title in lessonHrefs:
            print temp + lesson_href, ''.join(svg), lesson_title
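All of the snippets in this listing call a project-level getUrlSource(url) helper that is not shown here. Judging from how it is used (callers test len() of the result), it presumably returns the page source as a string and an empty string on failure. A minimal Python 2 sketch along those lines might look like the following; the timeout and User-Agent header are assumptions, not the project's actual values:

import urllib2

def getUrlSource(url, timeout=10):
    '''Hypothetical stand-in for the project's getUrlSource helper:
    return the page source, or '' if the request fails.'''
    try:
        request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        return urllib2.urlopen(request, timeout=timeout).read()
    except Exception:
        return ''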
Code example #2
    def getSoup(self, url):
        '''
        Fetch the page source and return it as a BeautifulSoup object,
        or False if nothing was returned.
        '''
        html = getUrlSource(url)
        if len(html) == 0:
            return False
        soup = BeautifulSoup(html)
        return soup
Code example #3
def getSoup(url):
    # Retry the fetch once a second, giving up after 20 attempts.
    num = 0
    while True:
        time.sleep(1)
        html = getUrlSource(url)
        num += 1
        if len(html) != 0 or num > 20:
            break
    soup = BeautifulSoup(html)
    return soup
Code example #4
def getSoup(url):
    # Retry every two seconds; report success or failure and stop after 20 tries.
    num = 0
    while True:
        time.sleep(2)
        html = getUrlSource(url)
        if len(html) != 0:
            print 'Right:\t', url
            break
        elif num > 20:
            print 'Wrong:\t', url
            break
        num += 1
    soup = BeautifulSoup(html)
    return soup
Code example #5
    def getContentLXML(self, url):
        '''Fetch an article page and extract its title and content with lxml.'''
        num = 1
        while True:
            mSourceCode = getUrlSource(url)
            if len(mSourceCode) != 0:
                break
            elif num > 10:
                # Give up after ten empty responses.
                return False
            else:
                time.sleep(20)
                num += 1
        page = etree.HTML(mSourceCode)
        titleModules = page.xpath("//div[@class='at_c']")
        #Read title
        if len(titleModules):
            title = titleModules[0].text
        else:
            title = ''

        #Read Content
        content = ''
        try:
            num = 1
            while True:
                mContentModules = page.xpath(
                    "//body/div[@class='ptb45 bgcolor1 xreader']")
                for cm in mContentModules:
                    temp = "".join([x for x in cm.itertext()])
                    content += temp
                num += 1
                if num > 200: break
                href = self.nextpage(url, num)
                mSourceCode = getUrlSource(href)
                if len(mSourceCode) == 0: break
                page = etree.HTML(mSourceCode)
        except:
            print 'Wrong contentAnalysis: ', url
        return {'title': title, 'content': content}
Code example #6
    def getContentBS(self, url):
        '''Fetch an article page and extract its title and content with BeautifulSoup.'''
        time.sleep(2)
        num = 1
        while True:
            mSourceCode = getUrlSource(url)
            if len(mSourceCode) != 0:
                break
            elif num > 10:
                # Give up after ten empty responses.
                return False
            else:
                time.sleep(20)
                num += 1
        soup = BeautifulSoup(mSourceCode)
        #Read title
        title = soup.find("div", {"class": "at_c"})
        title = '' if title is None else title.text

        #Read Content
        content = ''
        try:
            num = 1
            while True:
                temp = soup.find("div", {"class": "ptb45 bgcolor1 xreader"})
                if temp is None: break
                content = '%s%s' % (content, temp.text)
                num += 1
                if num > 200:
                    break
                href = self.nextpage(url, num)
                html = getUrlSource(href)
                if len(html) == 0:
                    break
                soup = BeautifulSoup(html)
        except:
            print 'Wrong contentAnalysis: ', url
        return {'title': title, 'content': content}
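Both getContentLXML and getContentBS call a self.nextpage(url, num) helper to build the URL of the next page of a document, but that helper is not included in this listing. Its real URL scheme is specific to wenku.baidu.com; the purely hypothetical version below is shown only to make the snippets self-contained and simply appends a page-number parameter:

    def nextpage(self, url, num):
        '''Hypothetical pager: the real project builds wenku-specific URLs.
        Here we just append a "pn" page-number query parameter.'''
        separator = '&' if '?' in url else '?'
        return '%s%spn=%d' % (url, separator, num)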
Code example #7
File: Baike.py    Project: fangzheng354/BaiduCrawler
def getSoup(url):
    num = 0
    while True:
        html = getUrlSource(url)
        if len(html) != 0:
            print 'Right:\t', url
            break
        elif num <= 5:
            time.sleep(5)
            num += 1
        elif num > 5:
            print 'Wrong:\t', url
            break
    soup = BeautifulSoup(html)
    return soup
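As a usage sketch for the retrying getSoup above (the URL is only an illustration, not taken from the project), it is called like any other parser entry point; an empty page simply yields a soup with no title:

soup = getSoup('http://baike.baidu.com/view/1.htm')
if soup.title is not None:
    print soup.title.string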