def fromCategoryId(categoryId):
    # bug fix: the category id was hardcoded to 101 even though the function
    # takes categoryId as a parameter; use the parameter instead
    url = MianFeiTXTChannelBaseUrl + '?' + paramMap().mianfeiTXT().put('secondCategoryId', categoryId)\
        .put('thirdCategoryId', 0).put('filterId', 0).put('sortId', 1).put('pageSize', 2000).put('pageNum', 1)\
        .mianfeiTXTSign().toUrl()
    baseInfoContent = getContentWithUA(url)
    if not baseInfoContent:  # retry once on an empty response
        baseInfoContent = getContentWithUA(url)
    baseObj = json.loads(baseInfoContent)
    for bookObj in baseObj['data']['books']:
        mid = bookObj['id']
        handleByMTID(mid)
def handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj):
    chapContentUrl = chapObj['url']
    chapContent = getContentWithUA(chapContentUrl)
    chapContentObj = json.loads(chapContent)
    if not chapContentObj or not chapContentObj['content'] or len(chapContentObj['content']) < MinChapContentLength:
        myLogging.error('zid %s content too small skip, chapContentUrl %s', bookObj['id'], chapContentUrl)
        return 0
    chapObj.update(chapContentObj)
    chapObj['title'] = chapObj['name']
    chapObj['rawUrl'] = chapContentUrl
    chapObj['idx'] = int(chapObj['serialNumber'])
    del chapObj['serialNumber']
    chapObj['size'] = len(chapObj['content'])
    chapObj['bookId'] = bookObj['id']
    chapObj['source'] = bookObj['source']
    chapObj['bookUUID'] = bookObj['digest']
    digest = getCapDigest(bookObj, chapObj, chapObj['bookChapterId'])
    chapObj['digest'] = digest
    chapObj['content'] = textClean(chapObj['content'])
    capId = insertCapWithCapObj(chapObj, allowUpdate=allowUpdate)
    # aftInsertCap = time.time()
    # insertCap = insertCap + (aftInsertCap - befInsertCap)
    if not capId:
        myLogging.error('no chapId cid %s', chapObj['bookChapterId'])
        return 0
    uploadJson2Bucket(str(chapObj['id']) + '.json', json.dumps(chapObj))
    return chapObj['idx']
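
# Several crawlers in this module fetch a URL and, on an empty response,
# immediately fetch it once more. A minimal sketch of that pattern as a
# shared helper; `getContentWithRetry` is a hypothetical name, not an
# existing function in this codebase.
def getContentWithRetry(url, ua=None, retries=1):
    content = getContentWithUA(url, ua) if ua else getContentWithUA(url)
    while not content and retries > 0:
        content = getContentWithUA(url, ua) if ua else getContentWithUA(url)
        retries -= 1
    return content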
def getBookObjBiZid(zid):
    bookInfoUrl = ZSSQBOOKINFOBASEURL + str(zid)
    bookInfoText = getContentWithUA(bookInfoUrl)
    if not bookInfoText:
        return None
    return json.loads(bookInfoText)
def getSourceId(qid):
    srcUrl = srcListBaseUrl % str(qid)
    srcListContent = getContentWithUA(srcUrl)
    if not srcListContent:
        return
    srcJsonObj = json.loads(srcListContent)
    if not srcJsonObj or not srcJsonObj.has_key('items'):
        myLogging.error('no srcObj items qid %s', qid)
        return
    srcItems = srcJsonObj['items']
    if len(srcItems.keys()) < 1:
        myLogging.error('srcObj items len < 1 qid %s', qid)
        return
    if srcItems.has_key('api.zhuishuwang.com'):
        return srcItems['api.zhuishuwang.com'][0]['book_source_id']
    # updateTIme = 0
    # resId = ''
    # for itmkey in srcItems.keys():
    #     if srcItems[itmkey][0]['update_time'] > updateTIme:
    #         resId = srcItems[itmkey][0]['book_source_id']
    #         updateTIme = srcItems[itmkey][0]['update_time']
    # return resId
    raise InputException('no zhuishuwang source, skip')
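
# The commented-out block above picks the most recently updated source when
# the preferred 'api.zhuishuwang.com' entry is missing. A sketch of that
# fallback as its own function (hypothetical name `getNewestSourceId`),
# assuming each item list carries 'update_time' and 'book_source_id' as the
# commented code does:
def getNewestSourceId(srcItems):
    newestTime = 0
    resId = ''
    for itmKey in srcItems.keys():
        if srcItems[itmKey][0]['update_time'] > newestTime:
            resId = srcItems[itmKey][0]['book_source_id']
            newestTime = srcItems[itmKey][0]['update_time']
    return resId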
def dealById(baseUrl, conn, csor, id):
    # slp = random.randint(1, 100)
    # time.sleep(0.01 * slp)
    url = baseUrl + str(id) + '.json'
    content = getContentWithUA(url, ua)
    if not content or len(content) < 60:
        print id, 'content', content
        return
    jsonObj = json.loads(content)
    data = jsonObj['data'][0]
    if not data or len(str(data)) < 10:
        print id, 'data:', data
        return
    companyType = data['companyType']
    webName = data['webName']
    companyName = data['companyName']
    liscense = data['liscense']  # 'liscense' is spelled this way by the upstream api and db schema
    examineDate = data['examineDate'].strip()
    webSite = ','.join(data['webSite'])
    try:
        csor.execute(
            """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName)
               values (%s,%s,%s,%s,%s,%s,%s,%s);""",
            (str(id), companyName, companyType, examineDate, liscense, "tianyacha", webSite, webName))
        conn.commit()
    except Exception as e:
        # roll back on error
        print e
def getChapsByBocId(bocId):
    chapListUrl = 'http://api.zhuishushenqi.com/btoc/%s?view=chapters' % (bocId)
    chapsText = getContentWithUA(chapListUrl)
    # if not chapsText:
    #     return
    chapListObj = json.loads(chapsText)
    return chapListObj
def getContentByUrl(url):
    capText = getContentWithUA(url, ua)
    if not (capText and len(capText) > 30):
        print 'cap content too short, skip and del book'
        return None
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))
    # ChapterName = ''
    # if len(capRoot.getiterator('ChapterName')) > 0:
    #     ChapterName = capRoot.getiterator('ChapterName')[0].text
    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    # if ('http://' in ChapterContent and len(ChapterContent) < 250):
    #     print 'cap content is url, skip and del book', bookId, ':', ChapterContent
    #     delBookById(bookId)
    #     return None
    # WordsCount = ''
    # if len(capRoot.getiterator('WordsCount')) > 0:
    #     WordsCount = capRoot.getiterator('WordsCount')[0].text
    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for i in range(2, PageCount + 1):
            # bug fix: the original replace() swapped the pageIndex token for
            # itself, so the same page was fetched on every iteration; rewrite
            # page 1's index to page i (callers pass a pageIndex=1 url)
            capApi2 = url.replace('pageIndex=1', 'pageIndex=' + str(i))
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            if ChapterContent == ChapterContent2:
                break
            ChapterContent = ChapterContent + ChapterContent2
    return ChapterContent
def startFromCId():
    baseUrl = 'http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick&pageIndex='
    cc = '&cid='
    page = 1
    shuqCategory = loadShuQSeqC()
    # shuqCategory2 = loadShuQC()
    # totleSize = 220
    for cid in shuqCategory.keys():
        try:
            url = baseUrl + str(page) + cc + str(cid) + capListAPIDeviceInfo
            urlContent = getContentWithUA(url, ua)
            if not (urlContent and len(urlContent) > 30):
                continue
            capRoot = ElementTree.fromstring(urlContent.encode('utf-8'))
            totleSize = int(capRoot.attrib['TotalCount']) / 40 + 1
            try:
                dealBookListPrintBooks(urlContent)
                for page in range(totleSize, 0, -1):
                    url = baseUrl + str(page) + cc + str(cid) + capListAPIDeviceInfo
                    urlContent = getContentWithUA(url, ua)
                    if not (urlContent and len(urlContent) > 30):
                        continue
                    dealBookListPrintBooks(urlContent)
            except Exception as e1:
                print 'deal one page error, cid:', cid, 'page:', page, 'error:', e1
        except Exception as e:
            print 'cid:', cid, 'error:', e
def search(searchInput):
    if isinstance(searchInput, unicode):
        searchInput = searchInput.encode('utf-8')
    url = ZSSQSEARCHBASEURL + quote(searchInput)
    searchResContent = getContentWithUA(url)
    if not searchResContent:
        return None
    searchResObj = json.loads(searchResContent)
    if not searchResObj or not searchResObj.has_key('books'):
        return
    return searchResObj
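
# A minimal usage sketch for search(): look up a title and keep only exact
# title/author matches, mirroring how changeSouceIds() filters its search
# results below. `findZssqBookId` is a hypothetical wrapper name, and the
# '_id'/'title'/'author' field names are assumptions about the zhuishushenqi
# search response, not confirmed by this module.
def findZssqBookId(title, author):
    searchResObj = search(title + author)
    if not searchResObj:
        return None
    for resBook in searchResObj['books']:
        if resBook.get('title') == title and resBook.get('author') == author:
            return resBook.get('_id')
    return None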
def getContentFromXml(bookId, capId, xml):
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='
    # capApi = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=1&bg=0' + capListAPIDeviceInfo
    # capText = getContentWithUA(capApi, ua)
    # if not (capText and len(capText) > 30):
    #     print 'cap content too short, skip and del book'
    #     delBookById(bookId)
    #     return None
    capRoot = ElementTree.fromstring(xml.encode('utf-8'))
    # ChapterName = ''
    # if len(capRoot.getiterator('ChapterName')) > 0:
    #     ChapterName = capRoot.getiterator('ChapterName')[0].text
    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    # if ('http://' in ChapterContent and len(ChapterContent) < 250):
    #     print 'cap content is url, skip and del book', bookId, ':', ChapterContent
    #     delBookById(bookId)
    #     return None
    # WordsCount = ''
    # if len(capRoot.getiterator('WordsCount')) > 0:
    #     WordsCount = capRoot.getiterator('WordsCount')[0].text
    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for pageIndex in range(2, PageCount + 1):
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(
                pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2
    return ChapterContent
def startFromLatestAjax():
    baseUrl = 'http://ajax.shuqiapi.com/?bamp=sqphcm&desc_type=3&page='
    tailUrl = '&tk=NDE3YWM1OWU5Zg%253D%253D'
    for page in range(86, 120):
        url = baseUrl + str(page) + tailUrl
        jsonContent = getContentWithUA(url, ua)
        jsonC = json.loads(jsonContent.encode('utf-8'))
        for book in jsonC['data']['ph']['book_list']:
            bookId = book['id']
            try:
                start(bookId)
            except Exception as e:
                print 'book', bookId, 'error:', e
def initCap():
    sqCat = dict()
    for i in range(0, 800):
        url = 'http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick' \
              '&pageIndex=1&cid=' \
              + str(i) + capListAPIDeviceInfo
        text = getContentWithUA(url, ua)
        if not (text and len(text) > 60):
            continue
        root = ElementTree.fromstring(text.encode('utf-8'))
        # fetch the first Book element via getiterator
        node = root.getiterator("Book")[0]
        parentName = node.attrib['ParentTypeName']
        ParentTypeId = node.attrib['ParentTypeId']
        TypeName = node.attrib['TypeName']
        print TypeName, ParentTypeId, parentName
        tag = dict()
        tag['TypeName'] = TypeName
        # tag['parentName'] = parentName
        # tag['ParentTypeId'] = ParentTypeId
        tag['cid'] = i
        if not sqCat.has_key(parentName):
            children = [tag]
            top = dict()
            top['id'] = ParentTypeId
            top['children'] = children
            sqCat[parentName] = top
        else:
            sqCat[parentName]['children'].append(tag)
        # sqCat[i] = tag
    f = open('shuqCategory.yaml', 'wb')
    yaml.dump(sqCat, f)
    f.close()
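
# Reading the category map back in, presumably what loadShuQSeqC() does
# elsewhere in this module; a sketch under that assumption, reusing the
# module-level yaml import that initCap() already relies on. The function
# name `loadShuqCategoryYaml` is hypothetical.
def loadShuqCategoryYaml(path='shuqCategory.yaml'):
    f = open(path, 'rb')
    try:
        return yaml.load(f)
    finally:
        f.close()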
def changeSouceIds():
    bookObjs = getMianAllBookBaseObjs()
    for bookObj in bookObjs:
        try:
            foundNewId = False
            title = bookObj['title']
            author = bookObj['author']
            source = bookObj['source']
            bookId = bookObj['id']
            searchUrl = MianFeiTXTSearchBaseUrl + '?' + paramMap().mianfeiTXT()\
                .put('keyword', (title + author).encode('utf-8'))\
                .put('pageSize', '10').put('pageNum', '1').put('type', '1')\
                .mianfeiTXTSign()\
                .toUrl()
            r = requests.get(searchUrl)
            searchRes = json.loads(r.text)
            for resBook in searchRes['data']['books']:
                resTitle = resBook['name']
                if resTitle != title:
                    continue
                resAuthor = resBook['author']
                if resAuthor != author:
                    continue
                resId = resBook['id']
                if str(resId) == str(source):
                    myLogging.info('WTF: id no change?, bookId: %s, orgSoueceId: %s, newId: %s', bookId, source, resId)
                latestChapObj = getLatestChapByBookId(bookId)
                if not latestChapObj:
                    myLogging.error('no chaps in db yet, bookId: %s, new mid: %s', bookId, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
                cid = latestChapObj['idx']
                chapTitle = latestChapObj['title']
                capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(resId).mChapId(
                    cid).mianfeiTXTSign().toUrl()
                capContent = getContentWithUA(capContentUrl)
                if not capContent:
                    capContent = getContentWithUA(capContentUrl)
                capListJsonObj = json.loads(capContent, strict=False)
                if not (capListJsonObj['returnCode'] == '0000'):
                    capListJsonObj = json.loads(capContent)
                if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                    myLogging.error('get chap detail fail mid: %s, cid: %s', resId, cid)
                    continue
                chapterName = capListJsonObj['data']['bookChapter']['chapterName']
                if chapterName == chapTitle:
                    myLogging.info('bookId %s change source from %s to %s', bookId, source, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
            if not foundNewId:
                myLogging.error('bookId %s did not find new id !!!, title: %s, author: %s, org source: %s',
                                bookId, title, author, source)
        except Exception as e:
            myLogging.error(traceback.format_exc())
def handlChapsByBookObjZidBocId(bookObj, zid, chapListObj, allowUpdate=False):
    resInx = 0  # tracks the largest chapter index successfully updated
    if not chapListObj:
        myLogging.error('zid %s get chaps list null', zid)
        return resInx
    if not chapListObj.has_key('chapters'):
        myLogging.error('zid %s chaps list no data', zid)
        return resInx
    capIdxs = set()
    capTitles = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in db
        capTitles = getChapTitlesByBookId(bookObj['id'])  # chapter titles already in db
    for idx in range(0, len(chapListObj['chapters'])):
        try:
            chapObj = chapListObj['chapters'][idx]
            if chapObj['title'] in capTitles:
                continue
            if idx in capIdxs:
                continue
            chapObj['cid'] = chapObj['link']
            if chapObj.has_key('id'):
                chapObj['cid'] = chapObj['id']
            chapObj['idx'] = idx
            chapContentUrl = ZSSQCHAPCONTENTBASEURL + quote(chapObj['link'])
            chapContentText = getContentWithUA(chapContentUrl)
            if not chapContentText:
                myLogging.error('zid: %s, dbid: %s, chapId: %s, get chapContent null', zid, bookObj['id'], chapObj['cid'])
                continue
            chapContentObj = json.loads(chapContentText)
            if not chapContentObj or not chapContentObj.has_key('chapter'):
                myLogging.error('zid: %s, dbid: %s, chapId: %s, get no chapter', zid, bookObj['id'], chapObj['cid'])
                continue
            if u'.' == chapContentObj['chapter']['title'] or len(chapContentObj['chapter']['title']) < 2:
                del chapContentObj['chapter']['title']
            chapObj.update(chapContentObj['chapter'])
            chapObj['content'] = chapObj['body']
            if chapObj.has_key('cpContent'):
                chapObj['content'] = chapObj['cpContent']
                del chapObj['cpContent']
            chapObj['content'] = textClean(chapObj['content'])
            if len(chapObj['content']) < MinChapContentLength:
                myLogging.error('zid %s cid %s content too small skip', zid, chapObj['cid'])
                continue
            del chapObj['body']
            del chapObj['link']
            chapObj['rawUrl'] = chapContentUrl
            # chapObj['size'] = int(WordsCount)
            chapObj['size'] = len(chapObj['content'])
            chapObj['bookId'] = bookObj['id']
            chapObj['source'] = bookObj['source']
            chapObj['bookUUID'] = bookObj['digest']
            digest = getCapDigest(bookObj, chapObj, chapObj['cid'])
            chapObj['digest'] = digest
            capId = insertCapWithCapObj(chapObj)
            if not capId:
                continue
            uploadJson2Bucket(str(capId) + '.json', json.dumps(chapObj))
            resInx = max(resInx, idx)
        except Exception as e:
            myLogging.error('zid: %s, dbid: %s, idx: %s, get exception', zid, bookObj['id'], idx)
            myLogging.error(traceback.format_exc())
    return resInx
def getBookObjBiQid(qid, srcId=None, allowUpdate=False):
    if not srcId:
        srcId = getSourceId(qid)
    bookInfoUrl = bookInfoBaseUrl % (qid, srcId)
    bookInfoContent = getContentWithUA(bookInfoUrl)
    bookInfoObj = json.loads(bookInfoContent)
    bookObj = bookInfoObj['items'][0]
    bookObj['title'] = bookObj['name']
    bookObj['subtitle'] = bookObj['desc']
    bookObj['imgUrl'] = checkDefaultImg(bookObj['img_url'])
    if bookObj['status'] == 'SERIALIZE':
        bookObj['bookType'] = u'连载'
    else:
        bookObj['bookType'] = u'完结'
    bookObj['rawUrl'] = bookInfoUrl
    bookObj['category'] = bookObj['labels']
    bookObj['categoryCode'] = getClassifyCodeByName(bookObj['category'])['categoryCode']
    # if categDict.has_key(bookObj['category']):
    #     if categDict[bookObj['category']]['id'] and len(categDict[bookObj['category']]['id']) > 0:
    #         bookObj['categoryCode'] = int(categDict[bookObj['category']]['id'])
    bookObj['type'] = bookObj['category']
    bookObj['typeCode'] = 0
    classObj = getClassifyCodeByName(bookObj['type'])
    if 0 != classObj['typeCode']:
        bookObj['typeCode'] = classObj['typeCode']
        bookObj['categoryCode'] = classObj['categoryCode']
    bookObj['source'] = str(qid) + '/' + str(srcId)
    chapListUrl = chapListBaseUrl % (qid, srcId)
    chapListContent = getContentWithUA(chapListUrl)
    chapListObj = json.loads(chapListContent)
    chapNum = len(chapListObj['items'])
    bookObj['chapterNum'] = chapNum
    if bookObj['chapterNum'] < MINCHAPNUM:
        myLogging.error('chap num too small skip, bookId %s', qid)
        return
    bookObj['size'] = chapNum * random.randint(1000, 3000)
    bookObj['viewNum'] = chapNum * random.randint(20000, 30000)
    bookObj = insertBookWithConn(bookObj, allowUpdate)
    if not bookObj:
        myLogging.error('null bookObj after insert Book to db, bookId %s', qid)
        return
    for chapObj in chapListObj['items']:
        try:
            handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj)
        except Exception as e:
            myLogging.error(traceback.format_exc())
def getShuqiCapList(bookId):
    capList = []
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_chapter.php?pagesize=40&bookId='
    capListAPI = capListAPIBase + str(bookId) + '&pageIndex=' + str(pageIndex) + capListAPIDeviceInfo
    text = getContentWithUA(capListAPI, ua)
    if not (text and len(text) > 160):
        return
    root = ElementTree.fromstring(text.encode('utf-8'))
    gatherId = root.getiterator('BookInfos')[0].attrib['GatherId']
    TotalCount = int(root.getiterator('BookInfos')[0].attrib['TotalCount'])
    topPageCount = TotalCount / 40 + 2  # total number of pages
    for i in range(1, topPageCount):  # without paging, i only ever equals 1
        if i == 1:
            pageRoot = root
        else:
            pageApi = capListAPIBase + str(bookId) + '&pageIndex=' + str(i) + capListAPIDeviceInfo
            pageText = getContentWithUA(pageApi, ua)
            if not (pageText and len(pageText) > 160):
                return
            pageRoot = ElementTree.fromstring(pageText.encode('utf-8'))
        if gatherId and gatherId != '':  # the book has sub-catalogs (volumes)
            for book in pageRoot.getiterator('Book'):
                vId = book.attrib['ChapterId']
                secondApi = capListAPI + '&vid=' + str(vId)  # sub-catalog url
                textSon = getContentWithUA(secondApi, ua)  # sub-catalog content
                xmlSon = ElementTree.fromstring(textSon.encode('utf-8'))  # sub-catalog xml
                sonTotalCount = int(xmlSon.getiterator('BookInfos')[0].attrib['TotalCount'])  # total records in the sub-catalog
                sonPageCount = sonTotalCount / 40 + 2  # total pages in the sub-catalog
                for j in range(1, sonPageCount):  # walk every page of the sub-catalog; without paging only page 1 is visited
                    if j == 1:
                        sonpageRoot = xmlSon  # page 1 needs no extra request
                    else:
                        morePageApi = capListAPIBase + str(bookId) + '&pageIndex=' + str(j) + capListAPIDeviceInfo \
                                      + '&vid=' + str(vId)  # paged sub-catalog url
                        morePageText = getContentWithUA(morePageApi, ua)
                        sonpageRoot = ElementTree.fromstring(morePageText.encode('utf-8'))  # sub-catalog xml
                    for realCap in sonpageRoot.getiterator('Book'):
                        realCapId = realCap.attrib['ChapterId']
                        chapTitle = realCap.attrib['BookChapter']
                        chapObj = {"cid": realCapId, 'title': chapTitle}
                        # dealCap(bookId, realCapId)
                        capList.append(chapObj)
        else:  # no second-level catalog, so no extra api requests and no paging to handle
            for realCap in pageRoot.getiterator('Book'):
                realCapId = realCap.attrib['ChapterId']
                chapTitle = realCap.attrib['BookChapter']
                chapObj = {"cid": realCapId, 'title': chapTitle}
                # dealCap(bookId, realCapId)
                capList.append(chapObj)
    return capList
def getBocObjsByZid(zid):
    getbocBaseUrl = 'http://api.zhuishushenqi.com/atoc?view=summary&book='
    botText = getContentWithUA(getbocBaseUrl + zid)
    bocObjs = json.loads(botText)
    return bocObjs
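
# A short usage sketch tying the two zhuishushenqi table-of-contents helpers
# together: list the toc summaries for a book, then fetch the chapter list of
# the first one via getChapsByBocId(). The '_id' field on the summary objects
# is an assumption about the api response, not confirmed by this module, and
# `getFirstChapList` is a hypothetical wrapper name.
def getFirstChapList(zid):
    bocObjs = getBocObjsByZid(zid)
    if not bocObjs:
        return None
    bocId = bocObjs[0].get('_id')
    if not bocId:
        return None
    return getChapsByBocId(bocId)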
def getCapContentObj(bookId, capId, mysqlBKid):
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='
    capApi = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(pageIndex) \
             + '&bg=0' + capListAPIDeviceInfo
    capText = getContentWithUA(capApi, ua)
    capObj = dict()
    capObj['bookFail'] = False  # flags whether the whole book is un-crawlable; if so, later chapters are not worth fetching
    if not capText:
        print 'cap content none'
        return None
    if not len(capText) > 30:
        print 'cap content too short, skip and del book'
        delBookById(mysqlBKid)
        capObj['bookFail'] = True
        return capObj
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))
    ChapterName = ''
    if len(capRoot.getiterator('ChapterName')) > 0:
        ChapterName = capRoot.getiterator('ChapterName')[0].text
    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    if not ChapterContent:
        # retry once: re-fetch and re-parse the same chapter page
        capText = getContentWithUA(capApi, ua)
        if not capText:
            print 'cap content none'
            return None
        if not len(capText) > 30:
            print 'cap content too short, skip and del book'
            delBookById(mysqlBKid)
            capObj['bookFail'] = True
            return capObj
        capRoot = ElementTree.fromstring(capText.encode('utf-8'))
        ChapterName = ''
        if len(capRoot.getiterator('ChapterName')) > 0:
            ChapterName = capRoot.getiterator('ChapterName')[0].text
        ChapterContent = ''
        if len(capRoot.getiterator('ChapterContent')) > 0:
            ChapterContent = capRoot.getiterator('ChapterContent')[0].text
        if not ChapterContent:
            return None
    ChapterContent = ChapterContent.strip()
    if ChapterContent.startswith('http') and len(ChapterContent) < 250:
        print 'cap content is url, skip and del book', bookId, ':', ChapterContent
        delBookById(mysqlBKid)
        capObj['bookFail'] = True
        return capObj
    WordsCount = ''
    if len(capRoot.getiterator('WordsCount')) > 0:
        WordsCount = capRoot.getiterator('WordsCount')[0].text
    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for i in range(2, PageCount + 1):
            pageIndex = i
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(
                pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2
    capObj['content'] = ChapterContent.replace(u'***求收藏***', '').replace(u'***(求收藏)***', '').replace(u'求收藏', '')
    capObj['title'] = ChapterName
    capObj['rawUrl'] = capApi[0:200]
    # capObj['size'] = int(WordsCount)
    capObj['size'] = len(capObj['content'])
    return capObj
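
# Usage sketch: walk the shuqi chapter list from getShuqiCapList() and fetch
# each chapter body with getCapContentObj(), stopping early when the whole
# book is flagged as un-crawlable. `crawlShuqiBook` is a hypothetical wrapper
# name; mysqlBKid is the book's db row id, as in the function above.
def crawlShuqiBook(bookId, mysqlBKid):
    chapObjs = getShuqiCapList(bookId)
    if not chapObjs:
        return
    for chapObj in chapObjs:
        capObj = getCapContentObj(bookId, chapObj['cid'], mysqlBKid)
        if not capObj:
            continue
        if capObj.get('bookFail'):
            break  # the whole book is marked un-crawlable; stop here
        print capObj['title'], capObj['size']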
def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx=1):
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in db
    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:
            if allowUpdate:
                if cid in capIdxs:
                    continue  # this chapter is already in db, skip
            befCrawl = time.time()
            succCapTimes = succCapTimes + 1
            # capContentUrl = MianFeiContentBaseUrl + str(cid) + '&contentid=' + str(mid)
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(bookObj['source']).mChapId(
                cid).mianfeiTXTSign().toUrl()
            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                capContent = getContentWithUA(capContentUrl, ua)
            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                resIdx = min(cid, resIdx)
                myLogging.info('chap content null, RETURN, capId: ' + str(cid) + ' mid: ' + str(mid))
                return resIdx  # the source api lags on updates; bail out here so the retry-forward logic added later can take over
            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null, RETURN, capId: ' + str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                return resIdx  # the source api lags on updates; bail out here so the retry-forward logic added later can take over
            if contentSoup.body['style']:
                del contentSoup.body['style']
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '').replace(u'\n\n', u'\n').replace(
                u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # capObj['size'] = int(WordsCount)
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']
            digest = getCapDigest(bookObj, capObj, cid)
            capObj['digest'] = digest
            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)
            capId = insertCapWithCapObj(capObj)
            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)
            if not capId:
                continue
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl ' + str(mid) + ' cap ' + str(cid) + ' exception: ' + str(e))
            resIdx = min(cid, resIdx)
    if succCapTimes > 1:
        succCapTimes = succCapTimes - 1
    myLogging.info(
        'crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) +
        ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) +
        ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx
def crawlCurrentBookObj(mid):
    # url = MianFeiTXTBaseUrl + str(mid)
    url = MianFeiTXTBookBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(mid).mianfeiTXTSign().toUrl()
    baseInfoContent = getContentWithUA(url, ua)
    if not baseInfoContent:
        baseInfoContent = getContentWithUA(url, ua)
    baseObj = json.loads(baseInfoContent)
    baseData = baseObj['data']['book']
    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    # contentUrl = baseData['contentUrl']
    count = baseData['latestChapterCount']  # inaccurate: the source updates late
    if count < MINCHAPNUM:
        myLogging.warning('chapNum too small, skip %s, return', str(mid))
        return None, None
    # isOver = baseData['isOver']
    BookType = baseData['serialStatus']
    # if isOver == 1:
    #     BookType = '完结'
    # bookDetailHtml = getContentWithUA(MianFeiTXTBookDetailUrl + str(mid), ua)
    # bookDetailSoup = getSoupByStr(bookDetailHtml)
    # bookDesc = bookDetailSoup.select_one('#J-desc').get_text().replace('\n', '').replace('\t\t', '\t')
    # bookLabels = []
    # for span in bookDetailSoup.select('#J-lables-items span'):
    #     bookLabels.append(span.get_text())
    bookObj = dict()
    bookObj['subtitle'] = baseData['summary']
    bookObj['source'] = "" + str(mid)
    bookObj['rawUrl'] = MianFeiTXTBaseUrl + str(mid)
    bookObj['title'] = title
    bookObj['chapterNum'] = count  # updates late
    bookObj['imgUrl'] = 'http://oss-public.antehao.cn/' + coverUrl
    bookObj['author'] = author
    bookObj['size'] = baseData['words']
    bookObj['category'] = baseData['secondCategory']
    bookObj['type'] = baseData['thirdCategory']
    bookObj['bookType'] = BookType
    bookObj['categoryCode'], bookObj['typeCode'], bookObj['category'] = getCategoryAndTypeCode(bookObj['category'],
                                                                                               bookObj['type'])
    bookObj['viewNum'] = random.randint(500000, 1000000)
    # latest chapter index, used as a second signal when checking for updates
    bookObj['latestCapIndex'] = min(baseData['latestChapterId'], 200000)
    # try:
    #     capExamples = bookDetailSoup.select('.J-category-li')
    #     if capExamples and len(capExamples) > 2:
    #         bookObj['latestCapIndex'] = int(capExamples[2]['id'])  # take the third one; sometimes there are 3, sometimes 6
    # except Exception:
    #     myLogging.warning(traceback.format_exc())
    return bookObj, count
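
# End-to-end usage sketch for the MianFeiTXT path: fetch the book's base
# object, persist it with insertBookWithConn() as getBookObjBiQid() does,
# then crawl its chapters with handleCapsByBookObj(). The wrapper name
# `crawlOneMid` is hypothetical.
def crawlOneMid(mid, allowUpdate=False):
    bookObj, count = crawlCurrentBookObj(mid)
    if not bookObj:
        return 0
    bookObj = insertBookWithConn(bookObj, allowUpdate)
    if not bookObj:
        myLogging.error('null bookObj after insert Book to db, mid %s', mid)
        return 0
    return handleCapsByBookObj(allowUpdate, bookObj, count, mid)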