def main(): url = 'http://wenku.baidu.com/portal/subject/8_s0_g0_v0' mCGetResourcesHref = CGetResourcesHref() soup = mCGetResourcesHref.getSoup(url) Hrefs = set() #set([href, ...]) #Get the subject hrefs. subjectHrefs = mCGetResourcesHref.getSubject(soup) Hrefs |= subjectHrefs url = 'http://wenku.baidu.com' for s_href, svg in subjectHrefs: soup = mCGetResourcesHref.getSoup(url + s_href) versionHrefs = mCGetResourcesHref.getVersion(soup) Hrefs |= versionHrefs for v_href, svg in versionHrefs: soup = mCGetResourcesHref.getSoup(url + v_href) gradeHrefs = mCGetResourcesHref.getGrade(soup) Hrefs |= gradeHrefs svgHrefs = mCGetResourcesHref.getSVG(soup) lessonHrefs = mCGetResourcesHref.getLessons(soup) url = 'http://wenku.baidu.com' for svg_href in svgHrefs: temp = url + svg_href time.sleep(3) mSourceCode = getUrlSource(temp) soup = BeautifulSoup(mSourceCode) svg = mCGetResourcesHref.getSelectSVG(soup) lessonHrefs = mCGetResourcesHref.getLessons(soup) for lesson_href, lesson_title in lessonHrefs: print temp + lesson_href, ''.join(svg), lesson_title
def getSoup(self, url):
    '''
    Download *url* and parse it; return False when nothing came back.
    '''
    page_source = getUrlSource(url)
    if len(page_source) == 0:
        return False
    return BeautifulSoup(page_source)
def getSoup(url):
    """Fetch *url* once per second until content arrives or ~21 tries elapse.

    Returns a BeautifulSoup of whatever was last downloaded (possibly an
    empty document if every attempt failed).
    """
    num = 0
    while True:
        time.sleep(1)  # pace the retries
        html = getUrlSource(url)
        # Bug fix: `num` was never incremented, so the `num > 20` guard
        # could never fire and an always-empty response looped forever.
        num += 1
        if len(html) != 0 or num > 20:
            break
    soup = BeautifulSoup(html)
    return soup
def getSoup(url):
    """Fetch *url* once per second until content arrives or ~21 tries elapse.

    Returns a BeautifulSoup of whatever was last downloaded (possibly an
    empty document if every attempt failed).

    NOTE(review): this is a duplicate of the preceding `getSoup` definition
    and shadows it at module level - consider deleting one of them.
    """
    num = 0
    while True:
        time.sleep(1)  # pace the retries
        html = getUrlSource(url)
        # Bug fix: `num` was never incremented, so the `num > 20` guard
        # could never fire and an always-empty response looped forever.
        num += 1
        if len(html) != 0 or num > 20:
            break
    soup = BeautifulSoup(html)
    return soup
def getSoup(url): num = 0 while True: time.sleep(2) html = getUrlSource(url) if len(html) != 0: print 'Right:\t', url break elif num > 20: print 'Wrong:\t', url break soup = BeautifulSoup(html) return soup
def getContentLXML(self, url): '''''' num = 1 while True: mSourceCode = getUrlSource(url) if len(mSourceCode) != 0: break elif len(mSourceCode) == 0: time.sleep(20) num += 1 elif num > 10: return False page = etree.HTML(mSourceCode) titleModules = page.xpath("//div[@class='at_c']") #Read title if len(titleModules): title = titleModules[0].text else: title = '' #Read Content content = '' try: num = 1 while True: mContentModules = page.xpath( "//body/div[@class='ptb45 bgcolor1 xreader']") for cm in mContentModules: temp = "".join([x for x in cm.itertext()]) content += temp num += 1 if num > 200: break href = self.nextpage(url, num) mSourceCode = getUrlSource(href) if len(mSourceCode) == 0: break page = etree.HTML(mSourceCode) except: print 'Wrong contentAnalysis: ', url return {'title': title, 'content': content}
def getContentBS(self, url): time.sleep(2) num = 1 while True: mSourceCode = getUrlSource(url) if len(mSourceCode) != 0: break elif len(mSourceCode) == 0: time.sleep(20) num += 1 elif num > 10: return False soup = BeautifulSoup(mSourceCode) #Read title title = soup.find("div", {"class": "at_c"}) title = '' if title is None else title.text #Read Content content = '' try: num = 1 while True: temp = soup.find("div", {"class": "ptb45 bgcolor1 xreader"}) if temp is None: break content = '%s%s' % (content, temp.text) num += 1 if num > 200: break href = self.nextpage(url, num) html = getUrlSource(href) if len(html) == 0: break soup = BeautifulSoup(html) except: print 'Wrong contentAnalysis: ', url return {'title': title, 'content': content}
def getSoup(url): num = 0 while True: html = getUrlSource(url) if len(html) != 0: print 'Right:\t', url break elif num <=5: time.sleep(5) num += 1 elif num > 5: print 'Wrong:\t', url break soup = BeautifulSoup(html) return soup