def main(self):
    """Entry point: crawl question index pages, resuming from page.txt.

    Redirects stdout into out.log for the duration of the run, reads the
    resume page number (stored value minus one, falling back to the total
    page count when zero), crawls pages from start_page down to 2, and
    persists progress back to page.txt after each page.
    """
    f_handler = open('out.log', 'w')
    original_stdout = sys.stdout
    sys.stdout = f_handler
    try:
        # page.txt holds the resume position; start one page before it.
        with open('page.txt', 'r') as page:
            start_page = int(page.readline().strip()) - 1
        print(self.getCurrentTime(), "开始页码", start_page)
        print(self.getCurrentTime(), "爬虫正在启动,开始爬取爱问知识人问题")
        self.total_num = self.getTotalPageNum()
        print(self.getCurrentTime(), "获取到目录页面个数", self.total_num, "个")
        if not start_page:
            # A stored value of 1 (start_page == 0) means start from the end.
            start_page = self.total_num
        # range(1, start_page) visits pages start_page down to 2 -- page 1
        # is never visited; preserved as-is from the original logic.
        for x in range(1, start_page):
            current = start_page - x + 1
            print(self.getCurrentTime(), "正在抓取第", current, "个页面")
            try:
                self.getQuestions(current)
            except urllib.error.URLError as e:
                if hasattr(e, "reason"):
                    print(self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因", e.reason)
            except Exception as e:
                print(self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因:", e)
            if current < start_page:
                # Persist progress so an interrupted run can resume.
                with open('page.txt', 'w') as f:
                    f.write(str(current))
                    print(self.getCurrentTime(), "写入新页码", current)
    finally:
        # FIX: restore stdout and close the log so buffered output is
        # flushed; the original leaked the handle and left stdout redirected.
        sys.stdout = original_stdout
        f_handler.close()
def main(self):
    """Entry point: crawl question index pages, resuming from page.txt.

    Reads the resume page number (stored value minus one), falls back to
    the total page count when it is zero, then crawls pages from
    start_page+1 down to 2, persisting progress back to page.txt.
    """
    # FIX: out.log was opened and leaked (the stdout redirect is disabled);
    # a context manager guarantees the handle is closed.
    with open('out.log', 'w', encoding='utf-8') as f_handler:
        # sys.stdout=f_handler
        with open('page.txt', 'r') as page:
            start_page = int(page.readline().strip()) - 1
        print(self.getCurrentTime() + "开始页码" + str(start_page))
        print(self.getCurrentTime() + "爬虫正在启动,开始爬取爱问知识人问题")
        self.total_num = self.getTotalPageNum()
        print(self.getCurrentTime() + "获取到目录页面个数" + str(self.total_num) + "个")
        if not start_page:
            # A stored value of 1 (start_page == 0) means start from the end.
            start_page = self.total_num
        print(str(start_page))
        # range(0, start_page) visits pages start_page+1 down to 2.
        for x in range(0, start_page):
            current = start_page - x + 1
            print(self.getCurrentTime() + "正在抓取第" + str(current) + "个页面")
            try:
                self.getQuestions(current)
            except urllib.error.HTTPError as e:
                # NOTE(review): only HTTPError is caught here; other
                # URLError subtypes propagate -- confirm this is intended.
                if hasattr(e, "reason"):
                    print(self.getCurrentTime() + "某总页面内抓取或提取失败,错误原因" + e.reason)
            if current < start_page:
                # Persist progress so an interrupted run can resume.
                with open('page.txt', 'w') as f:
                    f.write(str(current))
                    print(self.getCurrentTime() + "写入新页码" + str(current))
def main(self): f_handler=open('out.log', 'w') sys.stdout=f_handler page = open('page.txt', 'r') content = page.readline() start_page = int(content.strip()) - 1 page.close() print self.getCurrentTime(),"开始页码",start_page print self.getCurrentTime(),"爬虫正在启动,开始爬取爱问知识人问题" self.total_num = self.getTotalPageNum() print self.getCurrentTime(),"获取到目录页面个数",self.total_num,"个" for x in range(1,start_page): print self.getCurrentTime(),"正在抓取第",start_page-x+1,"个页面" try: self.getQuestions(start_page-x+1) except urllib2.URLError, e: if hasattr(e, "reason"): print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因", e.reason except Exception,e: print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因:",e
def main(self): f_handler = open('out.log', 'w') sys.stdout = f_handler page = open('page.txt', 'r') content = page.readline() start_page = int(content.strip()) - 1 page.close() print self.getCurrentTime(), "开始页码", start_page print self.getCurrentTime(), "爬虫正在启动,开始爬取爱问知识人问题" self.total_num = self.getTotalPageNum() print self.getCurrentTime(), "获取到目录页面个数", self.total_num, "个" for x in range(1, start_page): print self.getCurrentTime(), "正在抓取第", start_page - x + 1, "个页面" try: self.getQuestions(start_page - x + 1) except urllib2.URLError, e: if hasattr(e, "reason"): print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因", e.reason except Exception, e: print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因:", e
def start(self):
    """Crawler entry point (fixed-window variant): redirects stdout to
    out.log and crawls 9 index pages counting down from start_page.
    """
    f_handler = open('out.log','w')
    # All prints below land in out.log. NOTE(review): stdout is never
    # restored and the log handle is never closed.
    sys.stdout = f_handler
    page = open('page.txt','r')
    # NOTE(review): the line read from page.txt is ignored -- start_page
    # is hard-coded to 10 regardless of the stored resume value. Confirm
    # whether this was a temporary debugging change.
    content = page.readline()
    start_page = 10
    page.close()
    print self.getCurrentTime(),"开始页码",start_page
    print self.getCurrentDate(),"开始爬取"
    self.total_num = self.getTotalPageNum()
    print self.getCurrentTime(),"获取到目录页面个数",self.total_num,"个"
    # NOTE(review): dead branch -- start_page is always 10 at this point.
    if not start_page:
        start_page = self.total_num
    # Visits pages 10 down to 2 (the range end is hard-coded to match 10).
    for x in range(1,10):
        print self.getCurrentTime(),"正在抓取第",start_page-x+1,"个页面"
        try:
            self.getQuestions(start_page-x+1)
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因", e.reason
        except Exception,e:
            print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因:",e
def main(self): f_handler = open('out.log', 'w') sys.stdout = f_handler page = open('page.txt', 'r') content = page.readline() start_page = int(content.strip()) - 1 page.close() print self.getCurrentTime(), "Starting page", start_page print self.getCurrentTime(), "Web crawler is starting, crawles Aiwen" self.total_num = self.getTotalPageNum() print self.getCurrentTime(), "Get the number of pages: ", self.total_num if not start_page: start_page = int(self.total_num) for x in range(1, start_page): print self.getCurrentTime(), "Now crawling the ", start_page-x+1, "page" try: self.getQuestions(start_page-x+1) except urllib2.URLError, e: if hasattr(e, "reason"): print self.getCurrentTime(), "Extract info failed, reason: ", e.reason except Exception, e: print self.getCurrentTime(), "Extract info failed, reason: ", e
def main(self):
    """Crawler entry point: redirects stdout to out.log, resumes from the
    page number stored in page.txt, and crawls index pages counting down.
    """
    f_handler = open('out.log', 'w')
    # Everything printed below is captured in out.log.
    sys.stdout = f_handler
    # page.txt holds the resume position; start one page before it.
    page = open('page.txt', 'r')
    content = page.readline()
    start_page = int(content.strip()) - 1
    page.close()
    print self.getCurrentTime(), "开始页码", start_page
    print self.getCurrentTime(), "爬虫正在启动,开始爬取爱问知识人问题"
    self.total_num = 100  # the new site layout has 100 index pages in total
    print self.getCurrentTime(), "获取到目录页面个数", self.total_num, "个"
    # A stored value of 1 (start_page == 0) means start from the last page.
    if not start_page:
        start_page = self.total_num
    # Visits pages start_page down to 2.
    for x in range(1, start_page):
        print self.getCurrentTime(), "正在抓取第", start_page - x + 1, "个页面"
        try:
            # Page numbers are not the plain sequence 1 2 3 4 -- the site
            # generates them with a different algorithm.
            self.getQuestions(start_page - x + 1)
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因", e.reason
        except Exception, e:
            print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因:", e
def main(self): f_handler = open('out.log', 'w') sys.stdout = f_handler page = open('page.txt', 'r') content = page.readline() start_page = int(content.strip()) - 1 page.close() print self.getCurrentTime(),"start page is ", start_page print self.getCurrentTime(),"spider running" self.total_num = self.getTotalPageNum() print self.getCurrentTime(), "get total index page num", self.total_num if not start_page: start_page = self.total_num for x in range(1, start_page): print self.getCurrentTime(), "parsing No.", start_page - x + 1, "'s page" try: self.getQuestions(start_page - x + 1) except urllib2.URLError, e: if hasattr(e, "reason"): print self.getCurrentTime(),"parse this page failed, reason:", e.reason except Exception,e: print self.getCurrentTime,"parse this page failed, reason:", e