Example #1
0
def testChapterContent(url):
    """Fetch a chapter page, extract its body and insert it as a test row.

    The page at ``url`` is downloaded, inline ``<script>`` elements are
    removed, and the ``<div id="content">`` element is located.  Nested
    divs that carry a ``class`` or ``id`` attribute are dropped, the
    remaining div wrappers are stripped from the markup, and the result is
    inserted into ``sg_chapter_0`` together with fixed test values for the
    other columns.

    The escaped content and the final SQL statement are printed before the
    statement is executed.  If the page has no ``<div id="content">`` a
    short notice is printed and nothing is inserted.
    """
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # Raw string so that "\s" is passed to the regex engine unchanged.
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
    html = re_script.sub('', html)

    soup = BeautifulSoup.BeautifulSoup(html)
    cont = soup.find('div', id="content")
    if cont is None:
        # find() returns None when no element matches; without this guard
        # the findAll() call below would raise AttributeError.
        print('no <div id="content"> found in %s' % url)
        return

    # Drop nested divs that are identified by a class or id attribute.
    for nested in cont.findAll('div'):
        if nested.has_key('class') or nested.has_key('id'):
            nested.extract()

    # Strip the wrapper tags, leaving only the inner markup.
    cont = str(cont)
    for fragment in ('<div id="content">', '</div>', '<div>'):
        cont = cont.replace(fragment, '')
    cont = cont.strip()

    # NOTE(review): the statement is built by string formatting, so it
    # relies on Db.escape making the content safe inside the double-quoted
    # literal -- confirm against the Db implementation.
    cont = Db.escape(cont)
    print(cont)

    sql = 'insert into sg_chapter_0(bk_id,name,content,publish_time,ch_sort) values("%s","%s","%s","%s", "%s")'
    statement = sql % ('9', 'test', cont, '2015-09-13 23:25:32', '1')
    print(statement)
    db.execute(statement)
Example #2
0
    def chapter(self):
        """Build the chapter record from the parsed page.

        Returns a dict with three keys:

        * ``name`` -- taken from ``self.name``.
        * ``content`` -- the markup of the ``<div id="content">`` element
          with the wrapper tags and the ``readx()`` script tag removed,
          stripped of surrounding whitespace and passed through
          ``Db.escape``.  When the page has no such div the content is
          built from an empty string.
        * ``publish_time`` -- the current local time formatted as
          ``%Y-%m-%d %H:%M:%S``.
        """
        res = {}
        res['name'] = self.name

        content_div = self.soup.find('div', id="content")
        # A missing div comes back as None; str(None) would otherwise store
        # the literal text "None" as the chapter content.
        cont = '' if content_div is None else str(content_div)

        for fragment in ('<div id="content">', '</div>',
                         '<script>readx();</script>'):
            cont = cont.replace(fragment, '')
        cont = Db.escape(cont.strip())

        res['content'] = cont
        res['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
        return res
Example #3
0
	def chapter(self):
		"""Build the chapter record from the parsed page.

		Returns a dict with three keys:

		* ``name`` -- taken from ``self.name``.
		* ``content`` -- the markup of the ``<div id="content">`` element
		  with the wrapper tags and the ``readx()`` script tag removed,
		  stripped of surrounding whitespace and passed through
		  ``Db.escape``.  When the page has no such div the content is
		  built from an empty string.
		* ``publish_time`` -- the current local time formatted as
		  ``%Y-%m-%d %H:%M:%S``.
		"""
		res = {}
		res['name'] = self.name

		content_div = self.soup.find('div', id="content")
		# A missing div comes back as None; str(None) would otherwise store
		# the literal text "None" as the chapter content.
		cont = '' if content_div is None else str(content_div)

		for fragment in ('<div id="content">', '</div>',
				'<script>readx();</script>'):
			cont = cont.replace(fragment, '')
		cont = Db.escape(cont.strip())

		res['content'] = cont
		res['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
		return res
Example #4
0
    def chapter(self):
        """Build the chapter record from the parsed page.

        Nested divs inside ``<div id="content">`` that carry a ``class`` or
        ``id`` attribute are removed before the markup is serialised.

        Returns a dict with three keys:

        * ``name`` -- taken from ``self.name``.
        * ``content`` -- the remaining markup with the div wrapper tags
          removed, stripped of surrounding whitespace and passed through
          ``Db.escape``.  When the page has no content div the content is
          built from an empty string.
        * ``publish_time`` -- the current local time formatted as
          ``%Y-%m-%d %H:%M:%S``.
        """
        res = {}
        res['name'] = self.name

        content_div = self.soup.find('div', id="content")
        if content_div is None:
            # No content div on this page: avoid calling findAll() on None.
            cont = ''
        else:
            # Drop nested divs that are identified by a class or id.
            for nested in content_div.findAll('div'):
                if nested.has_key('class') or nested.has_key('id'):
                    nested.extract()

            cont = str(content_div)
            for fragment in ('<div id="content">', '</div>', '<div>'):
                cont = cont.replace(fragment, '')
            cont = cont.strip()

        res['content'] = Db.escape(cont)
        res['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S')

        return res
Example #5
0
	def chapter(self):
		"""Build the chapter record from the parsed page.

		Nested divs inside ``<div id="content">`` that carry a ``class`` or
		``id`` attribute are removed before the markup is serialised.

		Returns a dict with three keys:

		* ``name`` -- taken from ``self.name``.
		* ``content`` -- the remaining markup with the div wrapper tags
		  removed, stripped of surrounding whitespace and passed through
		  ``Db.escape``.  When the page has no content div the content is
		  built from an empty string.
		* ``publish_time`` -- the current local time formatted as
		  ``%Y-%m-%d %H:%M:%S``.
		"""
		res = {}
		res['name'] = self.name

		content_div = self.soup.find('div', id="content")
		if content_div is None:
			# No content div on this page: avoid calling findAll() on None.
			cont = ''
		else:
			# Drop nested divs that are identified by a class or id.
			for nested in content_div.findAll('div'):
				if nested.has_key('class') or nested.has_key('id'):
					nested.extract()

			cont = str(content_div)
			for fragment in ('<div id="content">', '</div>', '<div>'):
				cont = cont.replace(fragment, '')
			cont = cont.strip()

		res['content'] = Db.escape(cont)
		res['publish_time'] = time.strftime('%Y-%m-%d %H:%M:%S')

		return res
Example #6
0
        book_item['spider_url'] = domain + book_href
        book_item['name'] = book_name
        books.append(book_item)

    sql = 'insert into spider_book(name, spider_url, spider_engine) values '
    insert_val = []
    for book in books:

        if existsBook(book['name']):
            # print("作品:%s已经存在啦" % book['name'])
            continue

        sql += '("%s", "%s", "%s"),'
        insert_val.append(book['name'])
        insert_val.append(book['spider_url'])
        insert_val.append('biquge')

    if len(insert_val) > 0:
        sql = sql[0:-1]
        db.execute(sql % tuple(insert_val))


if __name__ == '__main__':
    # Script entry point.  ``domain`` and ``db`` are assigned at module
    # level because the book-insertion code above reads them as globals.
    domain = 'http://www.biquge.la'
    db = Db()

    # Only process the page when getSoup() returned something truthy.
    soup = getSoup()
    if soup:
        do(soup)