def getContentByUrl(url):
    capText = getContentWithUA(url, ua)
    if not (capText and len(capText) > 30):
        print 'cap content too short, skip'
        return None
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))

    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text

    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for pageIndex in range(2, PageCount + 1):
            # assumes the incoming url requested pageIndex=1; step the query
            # parameter to fetch each following page (the original replaced
            # the parameter with itself, so every request hit page 1)
            capApi2 = url.replace('pageIndex=1', 'pageIndex=' + str(pageIndex))
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            if ChapterContent == ChapterContent2:
                break
            ChapterContent = ChapterContent + ChapterContent2
    return ChapterContent
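# Usage sketch for getContentByUrl (hypothetical book/chapter ids; assumes the
# url asks for pageIndex=1, which the paging replace above relies on):
#
#   capUrl = ('http://api.shuqireader.com/reader/bc_showchapter.php?bookId=123'
#             '&chapterid=456&pageIndex=1&bg=0' + capListAPIDeviceInfo)
#   fullText = getContentByUrl(capUrl)
#   if fullText:
#       print len(fullText)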
def dealById(baseUrl, conn, csor, id):
    url = baseUrl + str(id) + '.json'
    content = getContentWithUA(url, ua)
    if not content or len(content) < 60:
        print id, 'content', content
        return

    jsonObj = json.loads(content)
    data = jsonObj['data'][0]
    if not data or len(str(data)) < 10:
        print id, 'data:', data
        return

    companyType = data['companyType']
    webName = data['webName']
    companyName = data['companyName']
    liscense = data['liscense']  # field name as spelled by the API
    examineDate = data['examineDate'].strip()
    webSite = ','.join(data['webSite'])

    try:
        csor.execute(
            """insert ignore into com_base
               (id,companyName,companyType,examineDate,liscense,source,webSite,webName)
               values (%s,%s,%s,%s,%s,%s,%s,%s);""",
            (str(id), companyName, companyType, examineDate, liscense,
             "tianyacha", webSite, webName))
        conn.commit()
    except Exception as e:
        # roll back on error
        print e
        conn.rollback()
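# Sketch of a driver loop over an id range (the endpoint prefix, id range and
# connection helper are illustrative assumptions, not from this file):
#
#   conn, csor = getTmathConnCsor()
#   base = 'http://example.com/company/'  # replace with the real JSON endpoint prefix
#   for cid in range(1, 10000):
#       dealById(base, conn, csor, cid)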
def koolearn(muluUrl, stage):
    global conn, csor
    if not conn or (not csor):
        conn, csor = getTmathConnCsor()
    muluHtmlContent = getContentWithUA(muluUrl, defaultPCUa)
    muluSoup = getSoupByStr(muluHtmlContent)
    for pageLi in muluSoup.select('.list01 ul li'):
        try:
            title = pageLi.select_one('h3').get_text()
            if u'下载' in title:  # skip download-only entries
                continue
            descTag = pageLi.select_one('.js2 p')
            if not descTag:
                descTag = pageLi.select_one('.js p')
            desc = descTag.get_text()
            tags = pageLi.select_one('.c_lv')['title']
            ntype = tags
            # pick one representative tag as the type, usually the second one
            if len(tags) > 3:
                ts = tags.split(' ')
                if len(ts) > 2:
                    ntype = ts[1]
            contentUrl = pageLi.select_one('h3 a')['href']
            kooleanStartByContentUrl(conn, contentUrl, csor, desc, ntype,
                                     stage, tags, title)
        except Exception:
            print traceback.format_exc()

    # recurse into the next page (last link in the pager); stop when the
    # pager is gone, instead of crashing on an empty list
    footLinks = muluSoup.select('#page a')
    if not footLinks:
        return
    nextUrl = footLinks[len(footLinks) - 1]['href']
    koolearn(urlparse.urljoin(muluUrl, nextUrl), stage)
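# Usage sketch (the listing URL and stage label are hypothetical; koolearn
# walks the pager recursively until no next-page link is found):
#
#   koolearn('http://www.koolearn.com/zhongkao/list01/', u'初中')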
def getBookCommentList(sqbookId):
    baseUrl = 'http://api1.shuqireader.com/reader/bc_bbs_interface.php?bid='
    midUrl = '&bbs=see&bbs_num=20&bbs_rand_num='
    commentList = []
    for i in range(1, 4):
        url = baseUrl + str(sqbookId) + midUrl + str(i) + capListAPIDeviceInfo
        commentText = getContentWithUA(url, ua)
        if not (commentText and len(commentText) > 30):
            print 'comment content too short'
            break
        capRoot = ElementTree.fromstring(commentText.encode('utf-8'))
        for comment in capRoot.getiterator('Bbs'):
            # each <Bbs> element carries BbsId, BbsIdUserName, BbsIdUserId,
            # BbsContent and BbsTime as attributes
            commentList.append(comment.attrib)
    return commentList
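# Usage sketch (hypothetical Shuqi book id; each returned item is the raw
# attribute dict of a <Bbs> element, per the comment above):
#
#   for c in getBookCommentList(123456):
#       print c.get('BbsIdUserName', ''), ':', c.get('BbsContent', '')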
def fromPapper():
    papperListUrl = 'http://api.jyeoo.com/v1/math2/report?b=0&s=0&g=%s&t=0&e=0&r=&y=0&x=&pi=%s&ps=20&po=2'
    papperDetailBaseUrl = 'http://api.jyeoo.com/v1/math2/report/%s?ia=false'
    questDetailUrl = 'http://api.jyeoo.com/math2/AppTag/GetQues/%s'
    AuthCode = 'Token 9F5BBF8F752F060B00D38F7C81686852695A463CD5661FE0848CBEADB3ACFD5EE96B0D3FB81C8FEB1' \
               'EEC4F7CDA82D9DEE1603C3FED9A10DCD04FF9A5D4A677589F8891C0CA24ECB55A50EA11FFE8AA1B6F389D23A42B' \
               '46E9529444F65FC72870C19AA1299F39C3809B3FB1C8D12B4C5E179FF3DA7ADB9AF5F8D40C95FC5418FE30CE3D884A' \
               '52DA1CCC9AAB43AC1DCC501FBE1936820E5D73'
    Cookie = 'jyean=5x1Tibi9YuqrgS_gvFCeIr04zbNLyVJh9OK3GtKXzwblw1fjXNmqyzh5Facz4VuQKP20e1BJjK' \
             'PgCqVHWiQ7nlBmyQoYE3JEwEFmlp60djjncxNj2m4iSwj8YnOHXa0p0;jy=6B882BD2C4626BDCBD' \
             '1A156DAEF3B6A149459A79829709858932A0831D669F2E504EB7C714F40360013A05356D0F284759128FC' \
             '09556AA4C66DB25AF5F6C5E8CA16BB1D5261C4B4C74C002D90BE6C0103D6B80DC270249B19D933EFED5E85651E2817' \
             '5AD8FDE7C21D6373B64C8276A4E25D88987C37AC54A91A8A44888540B25163F05F330F8E7A88991394DAFE124159DE8407' \
             'C8256AE9AE7CE4C8937AF95418BA780A2AEB99EF452B60E765A607BDCF94CF605D5D3BD058E9BE846875E6C2E2A587BE80' \
             '55436E5FD290661F6F3FEB41EF00CA118E16E13B42F509ED690F7038DA498DA9EA0A39E5F6A377E409A5230CA67C9B7C' \
             'A9A00B3356D77346878D2B78188D1F3D17F48619D51D6C9158C6491C96423357206B7BDF1FFD7A2C4A34C334F8EE97ED' \
             '32FE7E075315375AAACDEC9B8AA17AF3F367827930B803BD060A685F8693E318F7782663D9C18F84753229011B6D' \
             '356BD26835F31CAD0F65B1DE78D915FE08D2FBEA480574BF9431C2DF9AD;'
    header = dict()
    header['Authorization'] = AuthCode
    header['Cookie'] = Cookie

    for g in range(1, 13):  # grade
        for pageIndex in range(0, 150):  # the pi= query parameter
            pListContent = getContentWithUA(papperListUrl % (g, pageIndex), headers=header)
            if not pListContent:
                continue
            pListJson = json.loads(pListContent)
            for papper in pListJson['Data']:
                ppId = papper['ID']
                papperDetailContent = getContentWithUA(papperDetailBaseUrl % ppId, headers=header)
                if not papperDetailContent:
                    print 'papper detail failed, continue'
                    continue
                papperDetailJson = json.loads(papperDetailContent)
                ppTitle = papperDetailJson['Title']
                Score = papperDetailJson['Score']
                SchoolName = papperDetailJson['SchoolName']
                Degree = papperDetailJson['Degree']
                for partJson in papperDetailJson['Groups']:
                    partName = partJson['Key']
def startFromCId(p, queue):
    baseUrl = 'http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick&pageIndex='
    cc = '&cid='
    page = 1
    shuqCategory = loadShuQSeqC()
    shuqCategory2 = loadShuQC()
    totalSize = 220
    for cid in shuqCategory.keys():
        try:
            url = baseUrl + str(page) + cc + str(cid) + capListAPIDeviceInfo
            urlContent = getContentWithUA(url, ua)
            if not (urlContent and len(urlContent) > 30):
                continue
            capRoot = ElementTree.fromstring(urlContent.encode('utf-8'))
            totalSize = int(capRoot.attrib['TotalCount']) / 40 + 1
            try:
                dealBookListUrlContentMT(p, queue, shuqCategory2, urlContent)
                # walk the remaining pages from last to first
                for page in range(totalSize, 0, -1):
                    url = baseUrl + str(page) + cc + str(cid) + capListAPIDeviceInfo
                    urlContent = getContentWithUA(url, ua)
                    if not (urlContent and len(urlContent) > 30):
                        continue
                    dealBookListUrlContentMT(p, queue, shuqCategory2, urlContent)
            except Exception as e1:
                print 'deal one page error, cid: ', cid, ' page: ', page
        except Exception as e:
            print 'cid : ', cid, 'error: ', e
def getContentFromXml(bookId, capId, xml):
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='
    capRoot = ElementTree.fromstring(xml.encode('utf-8'))

    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text

    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        # the first page arrived as `xml`; fetch the remaining pages
        for pageIndex in range(2, PageCount + 1):
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(
                capId) + '&pageIndex=' + str(
                pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2
    return ChapterContent
def updateByCategoryIdZongheng(catId):
    sql = ('SELECT id,rawUrl,digest from cn_dushu_book where categoryCode = '
           + str(catId) + ' and rawUrl like "%zongheng%" ORDER BY id desc;')
    try:
        csor.execute(sql)
        conn.commit()
    except Exception as e:
        print 'mysql ex: ', e

    results = csor.fetchall()
    for book in results:
        bid = book[0]
        url = book[1]
        bookDigest = book[2]
        deleteCapsByBookId(bid)

        # the chapter list lives under /showchapter/ instead of /book/
        url = url.replace('com/book', 'com/showchapter')
        content = getContentWithUA(url, ua)
        soup = getSoupByStr(content)
        caps = soup.select('.chapterBean')
        if not caps:
            continue
        for i in range(0, len(caps)):
            cap = caps[i]
            capUrl = cap.select('a')[0]['href']
            capName = cap.select('a')[0].get_text()
            content, host = getAndParse(capUrl)
            if not content:
                continue
            capObj = dict()
            capObj['title'] = capName
            capObj['rawUrl'] = capUrl
            capObj['source'] = '纵横'
            capObj['content'] = content
            capObj['bookId'] = bid
            capObj['idx'] = i

            m2 = hashlib.md5()
            forDigest = capName + u'#' + str(i)
            m2.update(forDigest.encode('utf-8'))
            capObj['digest'] = m2.hexdigest()
            capObj['size'] = len(content)
            capObj['bookUUID'] = bookDigest
            insertCapWithCapObj(capObj, conn, csor)
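# The chapter digest above is md5("<capName>#<idx>"). A minimal sketch of that
# same convention as a standalone helper (the helper name is ours, not from
# this file):

def capDigest(capName, idx):
    # md5 over "title#index", matching the inline digest logic above
    m = hashlib.md5()
    m.update((capName + u'#' + str(idx)).encode('utf-8'))
    return m.hexdigest()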
def startFromLatestAjax():
    baseUrl = 'http://ajax.shuqiapi.com/?bamp=sqphcm&desc_type=3&page='
    tailUrl = '&tk=NDE3YWM1OWU5Zg%253D%253D'
    for page in range(86, 120):
        url = baseUrl + str(page) + tailUrl
        jsonContent = getContentWithUA(url, ua)
        jsonC = json.loads(jsonContent.encode('utf-8'))
        for book in jsonC['data']['ph']['book_list']:
            bookId = book['id']
            try:
                start(bookId)
            except Exception as e:
                print 'book ', bookId, ' error: ', e
def initCap():
    sqCat = dict()
    for i in range(0, 800):
        url = ('http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick&pageIndex=1&cid='
               + str(i) + capListAPIDeviceInfo)
        text = getContentWithUA(url, ua)
        if not (text and len(text) > 60):
            continue
        root = ElementTree.fromstring(text.encode('utf-8'))
        # grab the first Book element via getiterator; its attributes
        # describe the category this cid belongs to
        node = root.getiterator('Book')[0]
        parentName = node.attrib['ParentTypeName']
        ParentTypeId = node.attrib['ParentTypeId']
        TypeName = node.attrib['TypeName']
        print TypeName, ParentTypeId, parentName

        tag = dict()
        tag['TypeName'] = TypeName
        tag['cid'] = i
        if parentName not in sqCat:
            top = dict()
            top['id'] = ParentTypeId
            top['children'] = [tag]
            sqCat[parentName] = top
        else:
            sqCat[parentName]['children'].append(tag)

    f = open('shuqCategory.yaml', 'wb')
    yaml.dump(sqCat, f)
    f.close()
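# initCap writes a nested mapping to shuqCategory.yaml, shaped roughly like
# the sketch below (the category names and numbers are illustrative, not
# actual API data):
#
#   言情:
#     id: '3'
#     children:
#     - {TypeName: 总裁, cid: 17}
#     - {TypeName: 穿越, cid: 18}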
def searchAndCrawlByName(comName, proxy=None):
    if not comName:
        return None
    comName = comName.encode('utf-8')
    # baseUrl = 'http://www.qichacha.com/search?key=' + quote(comName)
    baseUrl = 'http://www.qichacha.com/firm_c3ece65bad28c17cc7f67168448e50e1.shtml'
    ua = random.choice(USER_AGENTS)
    htmlContent = getContentWithUA(baseUrl, ua, proxy=proxy)
    if not htmlContent:
        return None
    soup = getSoupByStrEncode(htmlContent)
    if not soup.select('ul.list-group a') or len(soup.select('ul.list-group a')) < 1:
        print htmlContent
        return None
    for uidTag in soup.select('ul.list-group a'):
        uid = uidTag['href'].replace('firm_', '')
        if uid == uidTag['href']:
            print 'not uid, skip', uidTag['href']
            continue
        uid = uid.replace('.shtml', '').replace('/', '')
        prv = None
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        try:
            insertWithUid(conn, csor, prv, uid)
        except Exception as e:
            print 'insert with uid fail, uid:', uid
    return 'ok'
def getInvestListByNameId(quid, qCname):
    if quid in investBloom:
        print 'invest already done before, uid:', quid
        return None
    url = ('http://www.qichacha.com/company_getinfos?unique=' + quid
           + '&companyname=' + quote(qCname.encode('utf-8')) + '&tab=touzi')
    resList = []
    htmlContent = getContentWithUA(url, ua)
    soup = getSoupByStrEncode(htmlContent)
    # select() over all item links; select_one() only ever yielded the
    # children of the first item
    for uidTag in soup.select('.list-group-item'):
        uid = uidTag['href'].replace('firm_', '').replace('.shtml', '').replace('/', '')
        prv = None
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        comName = uidTag.select_one('.text-lg').get_text()
        comObj = dict()
        comObj['uid'] = uid
        comObj['comName'] = comName
        # record the investee (prv was parsed from this link's uid)
        insertWithUid(conn, csor, prv, uid)
        getInvestListByNameId(uid, comName)  # recurse into the investee
        resList.append(comObj)
    if len(resList) < 1:
        # no investment records
        insertInvestList(quid, '')
    return resList
def getCapContentObj(bookId, capId, mysqlBKid):
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='
    capApi = capListAPIBase + str(bookId) + '&chapterid=' + str(
        capId) + '&pageIndex=' + str(
        pageIndex) + '&bg=0' + capListAPIDeviceInfo
    capText = getContentWithUA(capApi, ua)
    if not (capText and len(capText) > 30):
        print 'cap content too short, skip'
        return None
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))

    ChapterName = ''
    if len(capRoot.getiterator('ChapterName')) > 0:
        ChapterName = capRoot.getiterator('ChapterName')[0].text
    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text

    if not ChapterContent:
        # empty content: retry the request once
        capText = getContentWithUA(capApi, ua)
        if not (capText and len(capText) > 30):
            print 'cap content too short, skip'
            return None
        capRoot = ElementTree.fromstring(capText.encode('utf-8'))
        ChapterName = ''
        if len(capRoot.getiterator('ChapterName')) > 0:
            ChapterName = capRoot.getiterator('ChapterName')[0].text
        ChapterContent = ''
        if len(capRoot.getiterator('ChapterContent')) > 0:
            ChapterContent = capRoot.getiterator('ChapterContent')[0].text
        if not ChapterContent:
            return None

    ChapterContent = ChapterContent.strip()
    if ChapterContent.startswith('http') and len(ChapterContent) < 250:
        # the "content" is just a URL: treat the book as broken and delete it
        print 'cap content is url, skip and del book', bookId, ' : ', ChapterContent
        delBookById(mysqlBKid)
        return None

    WordsCount = ''
    if len(capRoot.getiterator('WordsCount')) > 0:
        WordsCount = capRoot.getiterator('WordsCount')[0].text
    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for pageIndex in range(2, PageCount + 1):
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(
                capId) + '&pageIndex=' + str(
                pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2

    capObj = dict()
    capObj['content'] = ChapterContent.replace(u'***求收藏***', '').replace(
        u'***(求收藏)***', '').replace(u'求收藏', '')
    capObj['title'] = ChapterName
    capObj['rawUrl'] = capApi[0:200]
    capObj['size'] = len(capObj['content'])
    return capObj
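# getCapContentObj and handleByMTID both fetch and then retry once by hand.
# A minimal sketch of that pattern as a reusable helper (the name, minLen
# default and retry count are our choices, not from this file):

def fetchWithRetry(url, ua, minLen=30, retries=1):
    # fetch `url`, retrying until the response is non-empty and long enough
    for _ in range(retries + 1):
        text = getContentWithUA(url, ua)
        if text and len(text) > minLen:
            return text
    return None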
def handleByMTID(mid):
    baseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getbaseinfo.ajax?contentid='
    capListBaseUrl = ('http://api.yingyangcan.com.cn/interface/ajax/book/getcatalog.ajax?contentid='
                      + str(mid) + '&pageindex=1&pagesize=100000000')
    capContentBaseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getcharpter.ajax?chapterindex='
    bookDetailUrl = 'http://m.yingyangcan.com.cn/interface/template/content/book_detail.vhtml?id='

    url = baseUrl + str(mid)
    baseInfoContent = getContentWithUA(url, ua)
    if not baseInfoContent:
        # retry once
        baseInfoContent = getContentWithUA(url, ua)
    baseObj = json.loads(baseInfoContent)
    baseData = baseObj['data']
    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    contentUrl = baseData['contentUrl']
    count = baseData['count']
    isOver = baseData['isOver']
    BookType = '连载'
    if isOver == 1:
        BookType = '完结'

    bookDetailHtml = getContentWithUA(bookDetailUrl + str(mid), ua)
    bookDetailSoup = getSoupByStr(bookDetailHtml)
    bookDesc = bookDetailSoup.select_one('#J-desc').get_text().replace(
        '\n', '').replace('\t\t', '\t')

    bookObj = dict()
    bookObj['subtitle'] = bookDesc
    bookObj['source'] = str(mid)
    bookObj['rawUrl'] = url
    bookObj['title'] = title
    bookObj['chapterNum'] = count
    bookObj['imgUrl'] = coverUrl
    bookObj['author'] = author
    bookObj['size'] = count * 1000
    bookObj['category'] = '仙侠'
    bookObj['type'] = '重生'
    bookObj['bookType'] = BookType
    bookObj['typeCode'] = 4
    bookObj['categoryCode'] = 1
    bookObj['viewNum'] = random.randint(500000, 1000000)

    m2 = hashlib.md5()
    forDigest = title + u'#' + author
    m2.update(forDigest.encode('utf-8'))
    bookObj['digest'] = m2.hexdigest()

    bookObj = insertBookWithConn(bookObj, conn2, csor2)

    # chapter ids start at 1047 to resume a previously interrupted run
    for cid in range(1047, count + 1):
        capContentUrl = capContentBaseUrl + str(cid) + '&contentid=' + str(mid)
        capContent = getContentWithUA(capContentUrl, ua)
        if not capContent:
            capContent = getContentWithUA(capContentUrl, ua)
        capListJsonObj = json.loads(capContent)
        if not (capListJsonObj['status'] == 1000
                and capListJsonObj['message'] == u'成功'):
            # refetch once before giving up on this chapter (the original
            # re-parsed the same response, which could never succeed)
            capContent = getContentWithUA(capContentUrl, ua)
            capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['status'] == 1000
                    and capListJsonObj['message'] == u'成功'):
                continue

        capObj = dict()
        orgContent = capListJsonObj['data']['chapter']
        contentSoup = getSoupByStr(orgContent)
        del contentSoup.body['style']
        content = unicode(contentSoup.body).replace(u'<body>', '').replace(
            u'</body>', '').replace(u'\n\n', u'\n').replace(
            u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
        capObj['content'] = content
        capObj['title'] = unicode(contentSoup.title.get_text())
        capObj['rawUrl'] = capContentUrl
        capObj['size'] = len(content)
        capObj['bookId'] = bookObj['id']
        capObj['source'] = bookObj['source']
        capObj['idx'] = cid
        capObj['bookUUID'] = bookObj['digest']

        m2 = hashlib.md5()
        forDigest = bookObj['digest'] + capObj['title'] + u'#' + str(cid)
        m2.update(forDigest.encode('utf-8'))
        capObj['digest'] = m2.hexdigest()

        capId = insertCapWithCapObj(capObj, conn2, csor2)
        if not capId:
            continue
        # capObj carries no 'id' key; use the id returned by the insert
        upload2Bucket(str(capId) + '.json', json.dumps(capObj))
def kooleanStartByContentUrl(conn, contentUrl, csor, desc='', ntype='',
                             stage='', tags='', title=''):
    detailHtmlContent = getContentWithUA(contentUrl, defaultPCUa)
    detailContentSoup = getSoupByStr(detailHtmlContent)
    detailContent = ''
    contentDiv = detailContentSoup.select_one('.show_l2 .mt40')
    contentDiv.select('p')[0].extract()  # the first <p> is the intro, drop it
    cps = contentDiv.select('p')
    for ci in range(0, len(cps)):
        if cps[ci].select('a'):
            print 'has link, extract, contentUrl:', contentUrl
            cps[ci].extract()
        # drop the trailing source/credit paragraphs
        if ci in [len(cps) - 1, len(cps) - 2, len(cps) - 3] and (
                u'新东方' in cps[ci].get_text() or u'来源' in cps[ci].get_text()):
            for cc in range(ci, len(cps)):
                cps[cc].extract()
            break
    detailContent = detailContent + unicode(contentDiv)

    # follow pagination, if any (the last page is just a back-link page)
    for page in range(2, 100):
        cUrl = contentUrl.replace('.html', '_' + str(page) + '.html')
        moreContentHtmlContent = getContentWithUA(cUrl, defaultPCUa)
        if not moreContentHtmlContent:
            print 'no more content, ', cUrl
            break
        moreContentSoup = getSoupByStr(moreContentHtmlContent)
        moreContentDiv = moreContentSoup.select_one('.show_l2 .mt40')
        pps = moreContentDiv.select('p')
        for ci in range(0, len(pps)):
            if pps[ci].select('a'):
                print 'has link, extract, link2:'
                pps[ci].extract()
            if ci in [len(pps) - 1, len(pps) - 2, len(pps) - 3] and (
                    u'新东方' in pps[ci].get_text() or u'来源' in pps[ci].get_text()):
                for cc in range(ci, len(pps)):
                    pps[cc].extract()
                break
        [a.unwrap() for a in moreContentDiv.select('a')]
        for img in moreContentDiv.select('img'):
            # make images responsive; has_attr() is the bs4 spelling
            if not img.has_attr('style') or len(img['style']) < 1:
                img['style'] = 'max-width:100%'
            else:
                preStyle = img['style']
                if preStyle.endswith(';'):
                    img['style'] = img['style'] + 'max-width:100%;'
                else:
                    img['style'] = img['style'] + ';max-width:100%'
        detailContent = detailContent + unicode(moreContentDiv)

    # write to the database, stripping source credits from the content
    csor.execute(
        'insert ignore into daily_news_copy (name,type,content,stage,author,tag,contentUrl,description) '
        'VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
        (title, ntype,
         detailContent.replace(u'新东方在线论坛', '').replace(u'相关链接:', '')
         .replace(u'来源:新东方在线论坛', '').replace(u'新东方在线', '')
         .replace(u'新东方', ''),
         stage, u'新东方', tags, contentUrl, desc))
    conn.commit()
def getShuqiCapList(bookId):
    capList = []
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_chapter.php?pagesize=40&bookId='
    capListAPI = capListAPIBase + str(bookId) + '&pageIndex=' + str(
        pageIndex) + capListAPIDeviceInfo
    text = getContentWithUA(capListAPI, ua)
    if not (text and len(text) > 160):
        return
    root = ElementTree.fromstring(text.encode('utf-8'))
    gatherId = root.getiterator('BookInfos')[0].attrib['GatherId']
    TotalCount = int(root.getiterator('BookInfos')[0].attrib['TotalCount'])

    topPageCount = TotalCount / 40 + 2  # total number of pages
    for i in range(1, topPageCount):  # with no paging, i only ever equals 1
        if i == 1:
            pageRoot = root
        else:
            pageApi = capListAPIBase + str(bookId) + '&pageIndex=' + str(
                i) + capListAPIDeviceInfo
            pageText = getContentWithUA(pageApi, ua)
            if not (pageText and len(pageText) > 160):
                return
            pageRoot = ElementTree.fromstring(pageText.encode('utf-8'))
        if gatherId and gatherId != '':  # the book has sub-volumes
            for book in pageRoot.getiterator('Book'):
                vId = book.attrib['ChapterId']
                secondApi = capListAPI + '&vid=' + str(vId)  # sub-volume url
                textSon = getContentWithUA(secondApi, ua)  # sub-volume content
                xmlSon = ElementTree.fromstring(
                    textSon.encode('utf-8'))  # sub-volume xml
                sonTotalCount = int(
                    xmlSon.getiterator('BookInfos')
                    [0].attrib['TotalCount'])  # records in the sub-volume
                sonPageCount = sonTotalCount / 40 + 2  # pages in the sub-volume
                for j in range(1, sonPageCount):  # walk every page of the sub-volume
                    if j == 1:
                        sonpageRoot = xmlSon  # the first page needs no extra request
                    else:
                        morePageApi = (capListAPIBase + str(bookId) + '&pageIndex=' + str(j)
                                       + capListAPIDeviceInfo
                                       + '&vid=' + str(vId))  # paged sub-volume url
                        morePageText = getContentWithUA(morePageApi, ua)
                        sonpageRoot = ElementTree.fromstring(
                            morePageText.encode('utf-8'))  # sub-volume xml
                    for realCap in sonpageRoot.getiterator('Book'):
                        realCapId = realCap.attrib['ChapterId']
                        capList.append(realCapId)
        else:  # no sub-volumes, so no extra requests or paging needed
            for realCap in pageRoot.getiterator('Book'):
                realCapId = realCap.attrib['ChapterId']
                capList.append(realCapId)
    return capList
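# Usage sketch chaining the chapter-list and chapter-content fetchers
# (hypothetical ids; mysqlBKid is whatever row id the book got on insert):
#
#   for capId in (getShuqiCapList(123456) or []):
#       capObj = getCapContentObj(123456, capId, mysqlBKid=1)
#       if capObj:
#           print capObj['title'], capObj['size']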
def today():
    baseUrl = 'http://www.todayonhistory.com/'
    conn, csor = getTmathConnCsor()
    for month in range(1, 13):
        for day in range(1, 32):
            type = '全部'
            jsonurl = baseUrl + str(month) + '/' + str(day)
            htmlContent = getContentWithUA(jsonurl, defaultPCUa)
            if not htmlContent or u'404-历史上的今天' in htmlContent:
                print 'no content skip month:', str(month), ' day:', str(day)
                continue
            soup = getSoupByStr(htmlContent)
            # compare against the title text, not the Tag object
            if '404' in soup.title.get_text():
                print '404 skip month:', str(month), ' day:', str(day)
                continue
            listUl = soup.select_one('ul.oh')
            for listLi in listUl.select('li'):
                liClasses = listLi['class']
                if 'typeid_53' in liClasses:
                    type = u'纪念'
                elif 'typeid_54' in liClasses:
                    type = u'节假日'
                elif 'typeid_55' in liClasses:
                    type = u'逝世'
                elif 'typeid_56' in liClasses:
                    type = u'出生'
                elif 'typeid_57' in liClasses:
                    type = u'事件'
                solarYear = listLi.select_one('span[class="poh"]').get_text()
                link = listLi.select_one('a')
                if not link:
                    print 'no link content, maybe bs4 bug, skip'
                    continue
                contentUrl = link['href']
                title = link['title']
                contentText = ''
                imgUrl = ''
                imgTag = listLi.select_one('img')
                if imgTag:
                    imgUrl = urlparse.urljoin(baseUrl, imgTag['src'])
                detailContentHtml = getContentWithUA(contentUrl, defaultPCUa)
                if detailContentHtml:
                    contentSoup = getSoupByStr(detailContentHtml)
                    contentBody = contentSoup.select_one('.body')
                    # strip pager, keyword, and ad blocks from the article body
                    for sel in ('.page', '.keyword', '.extra', '.mgg', '.poh', '.framebox'):
                        n = contentBody.select_one(sel)
                        if n:
                            n.extract()
                    contentText = unicode(contentBody)
                csor.execute(
                    'insert ignore into daily_today (name ,type ,content '
                    ',month ,day ,thumbImg ,solaryear,srcUrl) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                    (title, type, contentText, month, day, imgUrl, solarYear, contentUrl))
                conn.commit()

            # the rest of the day's events come from a paged JSON endpoint
            jsonBaseUrl = 'http://www.todayonhistory.com/index.php?m=content&c=index&a=json_event&page='  # &pagesize=40&month=2&day=13
            for page in range(1, 5):
                jsonurl = jsonBaseUrl + str(page) + '&pagesize=40&month=' + str(month) + '&day=' + str(day)
                jsonContent = getContentWithUA(jsonurl, defaultPCUa)
                if not jsonContent or len(jsonContent) < 10:
                    print 'json url return null or too short, maybe finished'
                    break
                jsonLists = json.loads(jsonContent)
                for jsonObj in jsonLists:
                    tid = jsonObj['id']
                    contentUrl2 = jsonObj['url']
                    title = jsonObj['title']
                    thumb = urlparse.urljoin(baseUrl, jsonObj['thumb'])
                    solaryear = jsonObj['solaryear']
                    contentText = ''
                    detailContentHtml = getContentWithUA(contentUrl2, defaultPCUa)
                    if detailContentHtml:
                        contentSoup = getSoupByStr(detailContentHtml)
                        contentBody = contentSoup.select_one('.body')
                        for sel in ('.page', '.keyword', '.extra', '.mgg', '.poh', '.framebox', '.mad'):
                            n = contentBody.select_one(sel)
                            if n:
                                n.extract()
                        contentText = unicode(contentBody)
                    csor.execute(
                        'insert ignore into daily_today (name ,type ,content '
                        ',month ,day ,thumbImg ,solaryear,srcUrl) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                        (title, '全部', contentText, month, day, thumb, solaryear, contentUrl2))
                    conn.commit()
            print 'done month:', str(month), ' day: ', str(day)