コード例 #1
0
 def spiderDownloadImgIml(self, id, imgUrl, title):  # Download one image
     """Download the image at imgUrl into res/<title>/ and record the result.

     :param id: detail-row id passed to changeDetailState
     :param imgUrl: direct URL of the image to fetch
     :param title: album title, used as the target directory name
     """
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # documented replacement for measuring elapsed time.
     start_time = time.perf_counter()
     print("[DownloadImg]downlaod image:%s" % imgUrl)
     try:
         response = requests.get(
             imgUrl, headers=Constants.getDefaultHeaderForDownload(imgUrl))
     except requests.exceptions.ConnectionError:
         store2ErrorFile('【错误】当前图片无法下载', imgUrl)
         self.changeDetailState(id, Constants.STATUS_FAILURE)
         return
     htmlContent = response.content
     # File name is the last path segment of the URL (fall back to the
     # whole URL when there is no '/').
     names = imgUrl.split('/')
     name = names[-1] if len(names) > 1 else imgUrl
     tmpPath = 'res/' + title + '/' + name
     self.mkdir('res/', title)
     # Binary append: plain open() is the right tool; codecs.open adds
     # nothing for byte content.
     with open(tmpPath, 'ab') as file:
         file.write(htmlContent)
     total_time = float(time.perf_counter() - start_time)
     self.changeDetailState(id, Constants.STATUS_SUCCESS)
     print("downlaod image:%s 。finish,总用时 : %f s" % (imgUrl, total_time))
コード例 #2
0
 def parseHtmlForMain(self, htmlContent):  # Parse a detail page's HTML
     """Extract (title, time, popularNum, likeNum, clearfloat) from htmlContent.

     clearfloat is the text between '[' and ']' inside the <script> tag of
     the div.clearfloat element (comma-separated picture info).
     On parse failure title is '' so callers can detect the error.
     """
     soup = BeautifulSoup(htmlContent, "html.parser")
     # Bind every result up front: the original left these names unbound
     # when parsing failed part-way, raising NameError at the return.
     title = pubTime = popularNum = likeNum = clearfloat = ''
     try:
         # 1. title
         article = soup.find('div', attrs={'class': 'article'})
         title = article.find('h2').string
         infos = article.find('div', attrs={'class': 'info'}).findAll('i')
         # 2. time / popularity / likes — strip the fixed-width label prefixes.
         pubTime = infos[0].string
         if len(pubTime) > 0:
             pubTime = pubTime[4:]
         popularNum = infos[3].string
         if len(popularNum) > 0:
             popularNum = popularNum[3:-1]
         likeNum = infos[4].string
         if len(likeNum) > 0:
             likeNum = likeNum[3:-1]
         # 3. picture info between '[' and ']' in the inline script.
         clearfloat = soup.find('div', attrs={
             'class': 'clearfloat'
         }).find('script').string
         clearfloatLeftIndex = clearfloat.index('[')
         clearfloatRightIndex = clearfloat.index(']')
         clearfloat = clearfloat[clearfloatLeftIndex + 1:clearfloatRightIndex]
     except (AttributeError, IndexError, TypeError, ValueError):
         # These are what failed soup lookups / indexing / .index() actually
         # raise; the original caught ConnectionError, which this body can
         # never produce, so every real parse error escaped.
         store2ErrorFile('【错误】parseHtmlForMain', htmlContent)
         title = ''
     return title, pubTime, popularNum, likeNum, clearfloat
コード例 #3
0
    def store2DBForDetail(self, maiId, picinfo):  # Persist per-image rows to sqlite
        """Expand picinfo ('<dir>,<album>,<count>') into one meitiDetail row
        per image URL, stored with status 0 (pending).

        :param maiId: id of the parent meitiMain row
        :param picinfo: comma-separated triple used to build the image URLs
        """
        self.conn = sqlite3.connect(Constants.DB_PATH)
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(
                'create table if not exists meitiDetail (id integer primary key autoincrement, '
                'mainId int, url text, status int)')

            picinfos = picinfo.split(',')
            if len(picinfos) != 3:
                # str() both values: maiId may be an int and picinfos is a
                # list, so the original '+' concatenation raised TypeError
                # on this very error path.
                store2ErrorFile("len(picinfos) != 3",
                                'maiId:' + str(maiId) + '...' + str(picinfos))
                return

            try:
                for i in range(1, int(picinfos[2])):
                    newUrl = Constants.DB_IMAGE + picinfos[0] + '/' + picinfos[
                        1] + '/' + str(i) + '.jpg'
                    # Parameterized query: the original %-formatted SQL broke
                    # on quotes in values and was injection-prone.
                    self.cursor.execute(
                        "insert into meitiDetail (mainId, url, status) values (?,?,0);",
                        (maiId, newUrl.strip()))
            except Exception as err:
                store2ErrorFile("插入数据出错", err.__str__())
        finally:
            # Always release resources — the original early return above
            # leaked the open cursor/connection.
            self.cursor.close()
            self.conn.commit()
            self.conn.close()
コード例 #4
0
 def getMainPage(self, url):
     '''
     Fetch the page at url and return its HTML text.

     :param url: page URL
     :return: decoded HTML, or '' when the request failed
     '''
     htmlContent = ''
     try:
         # timeout keeps a dead server from hanging the spider forever.
         response = requests.get(
             url, headers=Constants.Default_Header, timeout=30)
         # www.mmjpg.com does not declare a correct encoding; force utf-8.
         response.encoding = 'utf-8'
         htmlContent = response.text
     except requests.exceptions.RequestException:
         # RequestException is the base class: covers ConnectionError (all
         # the original handled) plus Timeout, TooManyRedirects, etc.
         store2ErrorFile('【错误】URL访问', url)
     return htmlContent
コード例 #5
0
 def startSpider(self, url):
     '''
     Crawl one page end-to-end: fetch the HTML, parse it, and persist
     the main record plus its per-image detail rows.
     :return:
     '''
     if not url:
         return
     print("===========url start============")
     print("url:%s" % url)
     pageHtml = self.getMainPage(url)
     if not pageHtml:
         return
     parsed = self.parseHtmlForMain(pageHtml)
     title, time, popularNum, likeNum, clearfloat = parsed
     if len(title) <= 0:
         store2ErrorFile('【错误】URL解析', url)
         return
     mainRowId = self.store2DBForMain(title, time, popularNum, likeNum)
     self.store2DBForDetail(mainRowId, clearfloat)
     print("===========finish============")
コード例 #6
0
    def store2DBForMain(self, title, time, popularNum, likeNum):  # Persist album row
        """Insert one meitiMain row and return its auto-generated id.

        :param title: album title
        :param time: publish-time string
        :param popularNum: view-count string
        :param likeNum: like-count string
        :return: rowid of the inserted row (None when the insert failed)
        """
        self.conn = sqlite3.connect(Constants.DB_PATH)
        self.cursor = self.conn.cursor()
        lastId = None
        try:
            self.cursor.execute(
                'create table if not exists meitiMain (id integer primary key autoincrement, '
                'title text, time datetime, popularNum int, likeNum int)')
            try:
                # Parameterized query: the original %-formatted SQL broke on
                # titles containing quotes and was injection-prone.
                self.cursor.execute(
                    "insert into meitiMain (title, time, popularNum, likeNum) values (?,?,?,?);",
                    (title.strip(), time.strip(), popularNum, likeNum))
            except Exception as err:
                store2ErrorFile("插入数据出错", err.__str__())
            # Read lastrowid while the cursor is still open — the original
            # read it after cursor.close().
            lastId = self.cursor.lastrowid
        finally:
            self.cursor.close()
            self.conn.commit()
            self.conn.close()
        return lastId