def spiderDownloadImgIml(self, id, imgUrl, title):
    '''
    Download one image to disk and record the outcome.

    The image is written to ``res/<title>/<name>``, where ``<name>`` is the
    last ``/``-separated segment of ``imgUrl``. On success
    ``self.changeDetailState`` is called with ``Constants.STATUS_SUCCESS``;
    on any request failure the URL is logged through ``store2ErrorFile`` and
    the state is set to ``Constants.STATUS_FAILURE``.

    :param id: identifier forwarded to ``self.changeDetailState``
    :param imgUrl: URL of the image to download
    :param title: name of the sub-directory under ``res/``
    :return: None
    '''
    # time.clock() was removed in Python 3.8; time.time() gives the elapsed
    # wall-clock time, which is what the final message reports.
    start_time = time.time()
    print("[DownloadImg]downlaod image:%s" % imgUrl)
    try:
        response = requests.get(
            imgUrl,
            headers=Constants.getDefaultHeaderForDownload(imgUrl),
            timeout=30)
        # An HTTP error page must not be stored as if it were the image.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Base class of ConnectionError, Timeout, HTTPError, ...
        store2ErrorFile('【错误】当前图片无法下载', imgUrl)
        self.changeDetailState(id, Constants.STATUS_FAILURE)
        return

    # Last path segment of the URL; the whole URL when it contains no '/'.
    name = imgUrl.rsplit('/', 1)[-1]
    tmpPath = 'res/' + title + '/' + name
    self.mkdir('res/', title)
    # 'wb' rather than 'ab': appending would stack a second copy of the bytes
    # onto an already-downloaded file.
    with open(tmpPath, 'wb') as image_file:
        image_file.write(response.content)

    total_time = float(time.time() - start_time)
    self.changeDetailState(id, Constants.STATUS_SUCCESS)
    print("downlaod image:%s 。finish,总用时 : %f s" % (imgUrl, total_time))
def parseHtmlForMain(self, htmlContent):
    '''
    Parse a main page and extract its metadata.

    Reads the ``<h2>`` inside ``div.article`` as the title, the ``<i>``
    elements inside ``div.info`` (indexes 0, 3 and 4) for the time, popular
    and like fields, and the text between ``[`` and ``]`` of the ``<script>``
    inside ``div.clearfloat``.

    If the expected structure is missing, the page is logged through
    ``store2ErrorFile`` and the returned title is ``''``; callers should treat
    an empty title as "parse failed".

    :param htmlContent: HTML text of the page
    :return: tuple ``(title, time, popularNum, likeNum, clearfloat)``
    '''
    soup = BeautifulSoup(htmlContent, "html.parser")
    # Pre-bind every result so the return statement is valid even when
    # parsing stops part-way through.
    title = publishTime = popularNum = likeNum = clearfloat = ''
    try:
        # 1. title
        article = soup.find('div', attrs={'class': 'article'})
        title = article.find('h2').string or ''
        infos = article.find('div', attrs={'class': 'info'}).findAll('i')
        # 2. time and counters: a fixed-width prefix (and, for the counters,
        # one trailing character) is stripped. `.string` may be None, hence
        # the `or ''`.
        publishTime = (infos[0].string or '')[4:]
        popularNum = (infos[3].string or '')[3:-1]
        likeNum = (infos[4].string or '')[3:-1]
        # 3. text between the first '[' and the first ']' of the script
        script = soup.find('div', attrs={
            'class': 'clearfloat'
        }).find('script').string
        clearfloat = script[script.index('[') + 1:script.index(']')]
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: find() returned None; IndexError: fewer <i> tags
        # than expected; ValueError: no '[' / ']' in the script text.
        store2ErrorFile('【错误】parseHtmlForMain', htmlContent)
        title = ''
    return title, publishTime, popularNum, likeNum, clearfloat
def store2DBForDetail(self, maiId, picinfo):
    '''
    Store the image URLs of one main record in the ``meitiDetail`` table.

    ``picinfo`` must be three comma-separated fields. One row is inserted for
    every ``i`` in ``range(1, int(third field))`` with the URL
    ``Constants.DB_IMAGE + first + '/' + second + '/' + i + '.jpg'`` and
    status 0. Failures are logged through ``store2ErrorFile`` instead of
    being raised.

    :param maiId: id of the owning row in the main table
    :param picinfo: string of the form ``"a,b,n"``
    :return: None
    '''
    self.conn = sqlite3.connect(Constants.DB_PATH)
    try:
        self.cursor = self.conn.cursor()
        self.cursor.execute(
            'create table if not exists meitiDetail (id integer primary key autoincrement, '
            'mainId int, url text, status int)')
        picinfos = picinfo.split(',')
        if len(picinfos) != 3:
            # str() is required here: maiId is normally an int and picinfos
            # is a list, so plain '+' concatenation raises TypeError.
            store2ErrorFile("len(picinfos) != 3",
                            'maiId:' + str(maiId) + '...' + str(picinfos))
            return
        try:
            main_id = int(maiId)
            # NOTE(review): the upper bound is exclusive, so if the third
            # field is the image count the last image is never stored --
            # confirm against the site before changing.
            rows = [
                (main_id,
                 (Constants.DB_IMAGE + picinfos[0] + '/' + picinfos[1] + '/'
                  + str(i) + '.jpg').strip())
                for i in range(1, int(picinfos[2]))
            ]
            # Bound parameters instead of string formatting: the URL parts
            # come from scraped page content.
            self.cursor.executemany(
                "insert into meitiDetail (mainId, url, status) values (?, ?, 0);",
                rows)
        except Exception as err:
            # Best-effort: record the problem and keep what was inserted.
            store2ErrorFile("插入数据出错", str(err))
        # Close the cursor, then commit the transaction.
        self.cursor.close()
        self.conn.commit()
    finally:
        # Always release the connection, including on the early return.
        self.conn.close()
def getMainPage(self, url):
    '''
    Fetch the text of a web page.

    :param url: address of the page
    :return: the page text decoded as UTF-8, or ``''`` when the request
             fails (the URL is then logged through ``store2ErrorFile``)
    '''
    try:
        # Without a timeout the call can block forever on a stalled server.
        response = requests.get(
            url, headers=Constants.Default_Header, timeout=30)
    except requests.exceptions.RequestException:
        # Base class of ConnectionError, Timeout, TooManyRedirects, ...
        store2ErrorFile('【错误】URL访问', url)
        return ''
    # www.mmjpg.com does not declare its encoding correctly, so set it here.
    response.encoding = 'utf-8'
    return response.text
def startSpider(self, url):
    '''
    Run the spider for one page: fetch it, parse it and store the result.

    Returns early, without storing anything, when ``url`` is empty, when the
    page could not be fetched, or when no title could be parsed (that last
    case is logged through ``store2ErrorFile``).

    :param url: address of the page to process
    :return: None
    '''
    if not url:
        return
    print("===========url start============")
    print("url:%s" % url)

    htmlContent = self.getMainPage(url)
    if not htmlContent:
        return

    # `publishTime` rather than `time`, so the time module is not shadowed.
    title, publishTime, popularNum, likeNum, clearfloat = \
        self.parseHtmlForMain(htmlContent)
    # Truthiness check instead of len(): a missing title may be None as well
    # as '', and len(None) raises TypeError.
    if not title:
        store2ErrorFile('【错误】URL解析', url)
        return

    lastId = self.store2DBForMain(title, publishTime, popularNum, likeNum)
    self.store2DBForDetail(lastId, clearfloat)
    print("===========finish============")
def store2DBForMain(self, title, time, popularNum, likeNum):
    '''
    Insert one row into the ``meitiMain`` table, creating it if needed.

    An insert failure is logged through ``store2ErrorFile`` instead of being
    raised.

    :param title: page title (surrounding whitespace is stripped)
    :param time: time text (surrounding whitespace is stripped)
    :param popularNum: value stored in the ``popularNum`` column
    :param likeNum: value stored in the ``likeNum`` column
    :return: the cursor's ``lastrowid`` after the insert attempt
    '''
    self.conn = sqlite3.connect(Constants.DB_PATH)
    try:
        self.cursor = self.conn.cursor()
        self.cursor.execute(
            'create table if not exists meitiMain (id integer primary key autoincrement, '
            'title text, time datetime, popularNum int, likeNum int)')
        try:
            # Bound parameters: a title containing a quote character would
            # break (or inject into) a statement built with string formatting.
            self.cursor.execute(
                "insert into meitiMain (title, time, popularNum, likeNum) "
                "values (?, ?, ?, ?);",
                (title.strip(), time.strip(), popularNum, likeNum))
        except Exception as err:
            # Best-effort: record the problem and carry on.
            store2ErrorFile("插入数据出错", str(err))
        # Read the row id while the cursor is still open.
        row_id = self.cursor.lastrowid
        # Close the cursor, then commit the transaction.
        self.cursor.close()
        self.conn.commit()
    finally:
        # Always release the connection.
        self.conn.close()
    return row_id