import re
import time
import urllib2

import BeautifulSoup  # BeautifulSoup 3: the module itself exposes the parser class

# Db is the project's own MySQL helper (it provides escape() and execute());
# its definition lives elsewhere and is assumed here, as in the original.


def testChapterContent(url):
    req = urllib2.Request(url)
    html = urllib2.urlopen(req).read()
    # Strip <script>...</script> blocks before parsing. Note that [^<]*
    # only matches script bodies that contain no '<' character.
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
    html = re_script.sub('', html)
    soup = BeautifulSoup.BeautifulSoup(html)
    cont = soup.find('div', id="content")
    # Drop every nested <div> that carries a class or id attribute.
    del_dev = cont.findAll('div')
    for del_item in del_dev:
        if del_item.has_key('class') or del_item.has_key('id'):
            del_item.extract()
    # Unwrap the container so only the chapter text remains.
    cont = str(cont)
    cont = cont.replace('<div id="content">', '')
    cont = cont.replace('</div>', '')
    cont = cont.replace('<div>', '')
    cont = cont.strip()
    cont = Db.escape(cont)
    print(cont)
    sql = ('insert into sg_chapter_0(bk_id, name, content, publish_time, ch_sort) '
           'values("%s", "%s", "%s", "%s", "%s")')
    print(sql % ('9', 'test', cont, '2015-09-13 23:25:32', '1'))
    db.execute(sql % ('9', 'test', cont, '2015-09-13 23:25:32', '1'))
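A quick way to exercise the helper, assuming the module-level db handle the function writes through has been constructed; the chapter URL is made up for illustration:

db = Db()
# Hypothetical URL: any biquge.la chapter page whose body sits in
# <div id="content"> should behave the same way.
testChapterContent('http://www.biquge.la/book/176/100001.html')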
def chapter(self):
    res = {}
    res['name'] = self.name
    cont = self.soup.find('div', id="content")
    # Unwrap the container and drop the page's inline readx() script tag.
    cont = str(cont)
    cont = cont.replace('<div id="content">', '')
    cont = cont.replace('</div>', '')
    cont = cont.replace('<script>readx();</script>', '')
    cont = cont.strip()
    cont = Db.escape(cont)
    res['content'] = cont
    res['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
    return res
def chapter(self):
    res = {}
    res['name'] = self.name
    cont = self.soup.find('div', id="content")
    # Drop every nested <div> with a class or id attribute, then unwrap
    # the container, as in testChapterContent above.
    del_dev = cont.findAll('div')
    for del_item in del_dev:
        if del_item.has_key('class') or del_item.has_key('id'):
            del_item.extract()
    cont = str(cont)
    cont = cont.replace('<div id="content">', '')
    cont = cont.replace('</div>', '')
    cont = cont.replace('<div>', '')
    cont = cont.strip()
    cont = Db.escape(cont)
    res['content'] = cont
    res['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
    return res
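Both chapter() variants expect an object that already holds the chapter title in self.name and the parsed page in self.soup; the second differs only in stripping attributed nested divs instead of the readx() script. A sketch of the calling side, with ChapterParser standing in for whatever class actually defines chapter():

# ChapterParser is a hypothetical name; the real class presumably
# fetches the page and builds self.name/self.soup in its constructor.
parser = ChapterParser(url)
row = parser.chapter()
sql = ('insert into sg_chapter_0(bk_id, name, content, publish_time, ch_sort) '
       'values("%s", "%s", "%s", "%s", "%s")')
db.execute(sql % ('9', row['name'], row['content'], row['publish_time'], '1'))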
        book_item['spider_url'] = domain + book_href
        book_item['name'] = book_name
        books.append(book_item)

    # Build one multi-row insert covering every book not yet in the table.
    sql = 'insert into spider_book(name, spider_url, spider_engine) values '
    insert_val = []
    for book in books:
        if existsBook(book['name']):
            # print("Book %s already exists" % book['name'])
            continue
        sql += '("%s", "%s", "%s"),'
        insert_val.append(book['name'])
        insert_val.append(book['spider_url'])
        insert_val.append('biquge')
    if len(insert_val) > 0:
        sql = sql[0:-1]  # drop the trailing comma
        db.execute(sql % tuple(insert_val))


if __name__ == '__main__':
    domain = 'http://www.biquge.la'
    db = Db()
    soup = getSoup()
    if soup:
        do(soup)
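All of the inserts above interpolate values straight into the SQL string and rely on Db.escape for safety. A minimal alternative sketch using parameterized placeholders, assuming (hypothetically) that Db exposes its underlying MySQLdb connection as db.conn:

# Hypothetical: db.conn stands in for the raw MySQLdb connection behind
# the Db helper. With placeholders, the driver handles all escaping.
rows = [(b['name'], b['spider_url'], 'biquge')
        for b in books if not existsBook(b['name'])]
if rows:
    cur = db.conn.cursor()
    cur.executemany(
        'insert into spider_book(name, spider_url, spider_engine) '
        'values (%s, %s, %s)',
        rows)
    db.conn.commit()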