# Standard-library imports used by the crawlers below. Helpers such as
# getContent, getSoupByStr, loadCategory, insertBook, getExistsCaps, insertCap,
# getConn, sizeStr2Int and the globals DefaultImg, p, queue are defined
# elsewhere in this module.
import hashlib
import random
import time
import urlparse
from xml.etree import ElementTree

from readability import Document  # used by the chapter-content fragment at the bottom


def easouCrawl():
    catDict = loadCategory()
    shukuUrl = 'http://b.easou.com/w/booklib/all_c_l_2_0_1.html'  # completed books
    shukuHtml = getContent(shukuUrl)
    if not shukuHtml:
        return
    shukuSoup = getSoupByStr(shukuHtml)
    conds = shukuSoup.select('.condition')
    if not conds or len(conds) < 1:
        print 'shuku error, stop '
        return
    for cat in conds[1].select('.n a'):  # iterate over each top-level category's 'All' page
        catUrl = urlparse.urljoin(shukuUrl, cat['href'])
        topCatName = cat.get_text()
        booklistHtml = getContent(catUrl)  # the 'All' page of this top-level category
        if not booklistHtml:
            continue
        booklistSoup = getSoupByStr(booklistHtml)
        Catconds = booklistSoup.select('.condition')
        if not Catconds or len(Catconds) < 1:
            print 'category error, skip,', cat.get_text()
            continue
        for tag in Catconds[2].select('.n a'):  # first iterate over the tag pages under this category
            tagUrl = urlparse.urljoin(catUrl, tag['href'])
            tagName = tag.get_text()
            if catDict.has_key(tagName):
                tagObj = catDict[tagName]
            else:
                tagObj = catDict[u'全部']
            dealTagPage(shukuUrl, tagName, tagObj, tagUrl)
            for n in range(2, 5):  # pages 2-4 of the tag listing
                dealTagPage(shukuUrl, tagName, tagObj, tagUrl.replace('1.html', str(n) + '.html'))
        # the 'all' link under the tag filter, crawled as an extra "other" tag
        allLinks = Catconds[2].select('.all a')
        if not allLinks:
            continue
        tag2 = allLinks[0]  # select() returns a list; take the first link
        tagUrl = urlparse.urljoin(catUrl, tag2['href'])
        tagName = u'其他' + tag2.get_text()  # prefix with u'其他' ("other")
        if catDict.has_key(tagName):
            tagObj = catDict[tagName]
        else:
            tagObj = catDict[u'全部']
        dealTagPage(shukuUrl, tagName, tagObj, tagUrl)
        for n in range(2, 5):
            dealTagPage(shukuUrl, tagName, tagObj, tagUrl.replace('1.html', str(n) + '.html'))
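# easouCrawl() and the functions below rely on getContent() / getSoupByStr()
# helpers that live elsewhere in this repo. The *_sketch functions below are
# only a minimal sketch of what those helpers are assumed to do (plain HTTP GET
# plus BeautifulSoup parsing); they are hypothetical stand-ins, not the repo's
# real implementations.
import requests
from bs4 import BeautifulSoup


def getContent_sketch(url, timeout=15):
    # assumed behaviour: fetch a page and return its decoded HTML, or None on failure
    try:
        resp = requests.get(url, timeout=timeout)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.RequestException:
        return None


def getSoupByStr_sketch(html):
    # assumed behaviour: parse an HTML string into a BeautifulSoup tree
    return BeautifulSoup(html, 'html.parser')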
def getBookObjFromSQid(id, shuqCategory):
    bookInfoAPI = 'http://api.shuqireader.com/reader/bc_cover.php?bookId=' + str(id) + '&book=same&book_num=5&bbs=pinglun&bbs_num=8&bbs_rand_num=1&lastchaps=1&ItemCount=3&soft_id=1&ver=110817&platform=an&placeid=1007&imei=862953036746111&cellid=13&lac=-1&sdk=18&wh=720x1280&imsi=460011992901111&msv=3&enc=666501479540451111&sn=1479540459901111&vc=e8f2&mod=M3'
    text = getContent(bookInfoAPI)
    if not (text and len(text) > 160):
        return None, None
    root = ElementTree.fromstring(text.encode('utf-8'))

    BookType = ''
    if len(root.getiterator('BookType')) > 0:
        BookType = root.getiterator('BookType')[0].text
    category = ''
    if len(root.getiterator('NickName')) > 0:
        category = root.getiterator('NickName')[0].text
    tag = ''
    if len(root.getiterator('ShortNickName')) > 0:
        tag = root.getiterator('ShortNickName')[0].text
    tagId = 0
    if root.getiterator('NickId') and len(root.getiterator('NickId')) > 0 and root.getiterator('NickId')[0].text:
        tagId = int(root.getiterator('NickId')[0].text)
    firstCid = 0
    if root.getiterator('ChapteridFirst') and len(root.getiterator('ChapteridFirst')) > 0 \
            and root.getiterator('ChapteridFirst')[0].text:
        firstCid = int(root.getiterator('ChapteridFirst')[0].text)
    if (not BookType) and (not category) and (not tag) and (not tagId):
        return None, None

    categoryId = 0
    if shuqCategory.has_key(tag):
        if shuqCategory[tag]['id'] and len(shuqCategory[tag]['id']) > 0:
            categoryId = int(shuqCategory[tag]['id'])
    size = 1
    if root.getiterator('Size') and len(root.getiterator('Size')) > 0:
        strSize = root.getiterator('Size')[0].text
        size = sizeStr2Int(strSize)
    NumChapter = 1
    if root.getiterator('NumChapter') and len(root.getiterator('NumChapter')) > 0 \
            and root.getiterator('NumChapter')[0].text:
        NumChapter = int(root.getiterator('NumChapter')[0].text)

    source = 'shuqi' + str(id)
    subtitle = root.getiterator('Description')[0].text
    title = root.getiterator('BookName')[0].text
    author = root.getiterator('Author')[0].text
    imgurl = root.getiterator('ImageExists')[0].text
    certainBookUrl = 'http://api.shuqireader.com/reader/bc_cover.php?bookId=' + str(id)
    if not title or len(title) < 1 or len(author) < 1:
        return None, None

    bookObj = dict()
    bookObj['subtitle'] = subtitle
    bookObj['source'] = source
    bookObj['rawUrl'] = certainBookUrl
    bookObj['title'] = title
    bookObj['chapterNum'] = NumChapter
    bookObj['imgUrl'] = imgurl
    bookObj['author'] = author
    bookObj['size'] = size
    bookObj['category'] = tag
    bookObj['type'] = category
    bookObj['bookType'] = BookType
    # bookObj['typeCode'] = 100 + tagId
    # bookObj['categoryCode'] = 100 + categoryId
    bookObj['typeCode'] = tagId
    bookObj['categoryCode'] = categoryId
    bookObj['firstCid'] = firstCid
    bookObj['viewNum'] = 0

    m2 = hashlib.md5()
    forDigest = title + u'#' + author
    m2.update(forDigest.encode('utf-8'))
    digest = m2.hexdigest()
    bookObj['digest'] = digest
    return bookObj, digest
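# Hypothetical usage sketch for getBookObjFromSQid(). The category dict shape
# (tag name -> {'id': ...} with the id stored as a string) is inferred from the
# code above; the bookId and tag below are made-up examples, not real data.
def demoShuqiLookup():
    shuqCategory = {u'都市': {'id': '12'}}  # assumed shape, example values only
    bookObj, digest = getBookObjFromSQid(123456, shuqCategory)
    if bookObj:
        # md5(title + '#' + author) is what callers can use for de-duplication
        print bookObj['title'], bookObj['author'], digest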
def dealTagPage(shukuUrl, tagName, tagObj, tagUrl):
    tagBooksHtml = getContent(tagUrl)
    if not tagBooksHtml:
        return
    tagBooksSoup = getSoupByStr(tagBooksHtml)
    for book in tagBooksSoup.select('.listcontent .name a'):
        t = random.randint(60, 100)
        time.sleep(t)
        bookUrl = urlparse.urljoin(tagUrl, book['href'])  # book search-result page
        bookName = book.get_text()

        bookObj = dict()  # book object passed between functions
        bookObj['title'] = bookName
        bookObj['type'] = tagName
        bookObj['typeCode'] = tagObj['typeCode']
        bookObj['categoryCode'] = tagObj['categoryCode']
        bookObj['category'] = tagObj['category']
        # bookId, bookUUID = insertBook(bookObj)  # insert book info into DB, get bookId and uuid

        bookMidHtml = getContent(bookUrl)
        if not bookMidHtml:
            continue
        bookMidSoup = getSoupByStr(bookMidHtml)
        lis = bookMidSoup.select('.resultContent li')
        if not lis or len(lis) < 1:
            print 'get book result list error, skip, book: ', bookUrl, ' tagUrl: ', tagUrl
            continue
        bookLi = lis[0]

        bookImgs = bookLi.select('.imgShow')[0].select('img')
        if bookImgs and len(bookImgs) > 0:
            imgsrc = bookImgs[0]['src']
            bookImg = urlparse.urljoin(bookUrl, imgsrc).replace(
                'http://b.easou.com/w/resources/imgs/pic.gif', DefaultImg)
        else:
            bookImg = DefaultImg
        certainBookUrl = urlparse.urljoin(bookUrl, bookLi.select('.name a')[0]['href'])
        author = bookLi.select('.author a')[0].get_text()
        # strip the "number of readers" wrapper text around the count
        count = bookLi.select('.count')[0].get_text().replace(u'追书人数:', '').replace(u'人追', '')
        bookObj['author'] = author
        bookObj['viewNum'] = count
        bookObj['imgUrl'] = bookImg

        # book detail page
        certainBookHtml = getContent(certainBookUrl)
        if not certainBookHtml:
            continue
        certainBookSoup = getSoupByStr(certainBookHtml)
        subtitle = certainBookSoup.select('.desc')[0].get_text()
        source = certainBookSoup.select('.source .t')[0].get_text()
        bookObj['subtitle'] = subtitle
        bookObj['source'] = source
        bookObj['rawUrl'] = certainBookUrl

        agendaLinks = certainBookSoup.select('.dao .category a')
        if not agendaLinks or len(agendaLinks) < 1:
            print 'get book agenda list error, skip, book: ', bookUrl, ' tagUrl: ', tagUrl
            continue

        # chapter index (table of contents) page
        agendaHtml = getContent(urlparse.urljoin(certainBookUrl, agendaLinks[0]['href']))
        if not agendaHtml:
            continue
        agendaSoup = getSoupByStr(agendaHtml)

        # finally on the TOC page: collect every chapter link
        caplist = list()
        caps = agendaSoup.select('.category li a')
        for i in range(0, len(caps)):
            cap = caps[i]
            capUrl = urlparse.urljoin(shukuUrl, cap['href'])
            capName = cap.get_text()
            capObj = {}
            capObj['url'] = capUrl
            capObj['name'] = capName
            caplist.append(capObj)

        pages = agendaSoup.select('.pager a')
        if pages and len(pages) > 0:
            for j in range(0, len(pages) - 1):
                pageA = pages[j]
                nextPageUrl = urlparse.urljoin(bookUrl, pageA['href'])
                # next page of the TOC
                agendaHtml2 = getContent(nextPageUrl)
                if not agendaHtml2:
                    continue
                agendaSoup2 = getSoupByStr(agendaHtml2)
                caps = agendaSoup2.select('.category li a')
                for i in range(0, len(caps)):
                    cap = caps[i]
                    capUrl = urlparse.urljoin(shukuUrl, cap['href'])
                    capName = cap.get_text()
                    capObj = {}
                    capObj['url'] = capUrl
                    capObj['name'] = capName
                    caplist.append(capObj)

        bookObj['chapterNum'] = len(caplist)

        # bookAreadyCrawled = insertBook(bookObj)
        # if not bookAreadyCrawled:
        #     checkCapsSql = 'select count(*) from cn_dushu_article where bookId = %d' % (bookObj['id'])
        #     try:
        #         csor.execute(checkCapsSql)
        #         conn.commit()
        #         results = csor.fetchall()
        #
        #         if not results or len(results) < 1:
        #             return None
        #         else:
        #             bookObj['id'] = results[0][0]
        #     except Exception as e:
        #         # roll back on error
        #         print 'check cap count failed ,skip', e

        if not insertBook(bookObj):  # insert book info into DB; bookId and uuid are written into bookObj
            print 'error, skip, bookName', bookObj['title']
            continue
        bookId = bookObj['id']

        existsCaps = getExistsCaps(bookId)
        for m in range(0, len(caplist)):
            if existsCaps and len(existsCaps) > 0:
                noNeedCrawlCap = False
                for cap in existsCaps:
                    if cap[0] == m and cap[1] > 300:
                        noNeedCrawlCap = True
                        break
                if noNeedCrawlCap:
                    print 'cap exists, no need to recrawl, bookName', bookObj['title'], ' bookId', bookId, ' capIndex: ', m
                    continue
            capUrl = caplist[m]['url']
            capName = caplist[m]['name']
            p.apply_async(insertCap, args=(bookObj, capUrl, capName, m, queue))
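# dealTagPage() hands each chapter to p.apply_async(insertCap, ...), so it
# assumes a module-level worker pool `p` and a shared `queue` defined elsewhere
# in this repo. A minimal sketch of how such globals could be created; the
# function name and pool size are arbitrary examples, not the repo's actual setup.
def makeWorkerPoolSketch(workers=4):
    import multiprocessing
    pool = multiprocessing.Pool(processes=workers)
    queue = multiprocessing.Manager().Queue()
    return pool, queue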
def juren():
    csor, conn = getConn()
    # jokes for primary-to-junior-high students
    for i in range(1, 25):
        # jokes
        # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/index_' + str(i) + '.html'
        # stories
        url = 'http://aoshu.juren.com/tiku/mryt/yimryt/index_' + str(i) + '.html'
        # famous people
        # url =
        if i == 1:
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaogushi/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/neirongsucai/'
            url = 'http://aoshu.juren.com/tiku/mryt/yimryt/'
        content = getContent(url)
        if not content:
            print 'get content failed, url: ', url
            continue
        soup = getSoupByStr(content)
        if not soup:
            print 'get soup failed, url:', url
            continue
        for listting in soup.select(".listing1"):
            for a in listting.select('a'):
                text = a.get_text()
                titles = text.split(u'：')  # try the fullwidth colon first
                if len(titles) < 2:
                    titles = text.split(u':')  # fall back to the ASCII colon
                if len(titles) < 2:
                    title = text
                else:
                    title = titles[1]
                deatilUrl = a['href']
                contentHtml = getContent(deatilUrl)
                if not contentHtml:
                    print 'get detail failed'
                    continue
                contentSoup = getSoupByStr(contentHtml).select('.mainContent')
                content = ''
                ps = contentSoup[0].select('p')
                length = len(ps)
                for j in range(1, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText \
                            or u'点击下一页查看答案' in pText or u'下一页查看答案' in pText or u'查看答案' in pText \
                            or len(pJ.select('a')) > 0:
                        print 'not content,break, text:' + pText
                        break
                    content += unicode(pJ)
                # second page of the detail article
                contentHtml2 = getContent(deatilUrl.replace('.html', '_2.html'))
                if not contentHtml2:
                    print 'get detail failed'
                    continue
                # contentSoup2 = getSoupByStr(contentHtml2.replace('<br /></p>','')).select('.mainContent')
                contentSoup2 = getSoupByStr(contentHtml2).select('.mainContent')
                ps = contentSoup2[0].select('p')
                length = len(ps)
                for j in range(0, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText or len(pJ.select('a')) > 0:
                        print 'not content,break, text:' + pText
                        break
                    content += unicode(pJ)
                sql = "INSERT ignore INTO daily(name, type, content, stage, gred) " \
                      "VALUES ('%s', '%d', '%s', '%s', '%d')" % (title, 3, content, '3', 1)
                try:
                    # execute the SQL statement
                    print sql
                    csor.execute(sql)
                    # commit to the database
                    print conn.commit()
                except:
                    # roll back on error
                    conn.rollback()
    conn.close()
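# The INSERT in juren() builds SQL by string formatting, which breaks as soon
# as the title or content contains a quote character. A sketch of the same
# insert using a parameterized query (MySQLdb/PyMySQL-style %s placeholders);
# the table and column names come from the code above, the function name is
# hypothetical.
def insertDailySketch(csor, conn, title, content):
    sql = ("INSERT IGNORE INTO daily(name, type, content, stage, gred) "
           "VALUES (%s, %s, %s, %s, %s)")
    try:
        # the driver escapes the values, so quotes in title/content are safe
        csor.execute(sql, (title, 3, content, '3', 1))
        conn.commit()
    except Exception:
        conn.rollback()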
results = csor.fetchall()
lastTime = 0
for row in results:
    # content = row[1]
    # content = row[4].replace('mi', 'mo')
    id = row[0]
    # url = row[1]
    url = 'http://www.3dllc.com/html/37/37023/9515879.html'
    # if not u'easou' in url:
    #     continue
    newContent = getContent(url)
    doc = Document(newContent)
    content = doc.summary(html_partial=True)
    #
    # soup = getSoupByStr(newContent)
    #
    # ps = soup.select('#chapterContent')[0]
    #
    # ps.select('div')[0].unwrap()
    #
    # ps.unwrap()
    # for water in soup.select('.watermark'):
    #     water.extract()
    #
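# The commented-out lines above sketch an alternative to readability's
# Document.summary(): select '#chapterContent' directly and drop '.watermark'
# nodes with BeautifulSoup. A runnable sketch of that idea; the selector names
# are taken from the comments above, the function name is hypothetical.
def stripWatermarkSketch(html):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    for water in soup.select('.watermark'):
        water.extract()  # remove watermark nodes entirely
    nodes = soup.select('#chapterContent')
    return unicode(nodes[0]) if nodes else None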