def insertCap(bookObj, capUrl, capName, idx, queue): title = capName rawUrl = capUrl content, source = getAndParse(capUrl) if not (content and source): print 'no content got, fill with temp ,capUrl : ', capUrl content = '暂缺,请稍后再来' source = '' bookId = bookObj['id'] size = len(content) bookUUID = bookObj['digest'] import hashlib m2 = hashlib.md5() forDigest = capName + u'#' + source m2.update(forDigest.encode('utf-8')) digest = m2.hexdigest() capObj = dict() capObj['title'] = title capObj['rawUrl'] = rawUrl capObj['source'] = source capObj['content'] = content capObj['bookId'] = bookId capObj['idx'] = idx capObj['digest'] = digest capObj['size'] = size capObj['bookUUID'] = bookUUID queue.put(capObj)
def updateByCategoryIdZongheng(catId): sql = 'SELECT id,rawUrl,digest from cn_dushu_book where categoryCode = ' + str( catId) + ' and rawUrl like "%zongheng%" ORDER BY id desc;' try: csor.execute(sql) conn.commit() except Exception as e: # # 发生错误时回滚 print 'mysql ex: ', e results = csor.fetchall() for book in results: bid = book[0] url = book[1] bookDigest = book[2] deleteCapsByBookId(bid) url = url.replace('com/book', 'com/showchapter') content = getContentWithUA(url, ua) soup = getSoupByStr(content) caps = soup.select('.chapterBean') if not caps: continue for i in range(0, len(caps)): cap = caps[i] capUrl = cap.select('a')[0]['href'] capName = cap.select('a')[0].get_text() content, host = getAndParse(capUrl) if not content: continue capObj = dict() capObj['title'] = capName capObj['rawUrl'] = capUrl capObj['source'] = '纵横' capObj['content'] = content capObj['bookId'] = bid capObj['idx'] = i m2 = hashlib.md5() forDigest = capName + u'#' + str(i) # forDigest = u'总裁我很忙#jxj季' m2.update(forDigest.encode('utf-8')) digest = m2.hexdigest() capObj['digest'] = digest capObj['size'] = len(content) capObj['bookUUID'] = bookDigest insertCapWithCapObj(capObj, conn, csor)
def updateByBookId(id): res = getExistsCapsRawUrlId(int(id)) if not res: return for cap in res: cid = cap[0] url = cap[1] if not url or len(url) < 1: print cid, 'no url, skipp' break content, host = getAndParse(url) if not content: continue updateContentById(cid, content)
def updateCapFromTo(f, t): print 'from', str(f), ' to ', str(t) offset = 100 begin = f end = begin + offset while end <= t: # sql = "select id, rawUrl,bookId,content from cn_dushu_acticle where id >= %d and id < %d" % (begin, end) try: csor2.execute( "select id, rawUrl,bookId,content from cn_dushu_acticle where id >= %d and id < %d", (begin, end)) conn2.commit() except Exception as e: # # 发生错误时回滚 print 'mysql ex: ', e begin = begin + offset end = end + offset results = csor2.fetchall() for cap in results: cid = cap[0] capUrl = cap[1] bookId = cap[2] unclearContent = cap[3] if not (u' 言情小说_打造最新原创' in unclearContent or unclearContent == 'None'): continue try: if not capUrl or len(capUrl) < 1: print 'no url, bookId : ', bookId if 'shuqireader' in capUrl: content = getContentByUrl(capUrl) # updateContentById(cid, content) else: content, host = getAndParse(capUrl) if not content: continue updateContentById(cid, content) except Exception as e: print 'cid ', cid, 'error: ', e except ValueError as er: print 'cid ', cid, 'error: ', er
def handleCapUpload(cap): cid = cap[0] capUrl = cap[2] bookId = cap[5] unclearContent = cap[4] capObj = dict() capObj['id'] = cap[0] capObj['title'] = cap[1] capObj['rawUrl'] = cap[2] capObj['source'] = cap[3] capObj['content'] = cap[4] capObj['bookId'] = cap[5] capObj['idx'] = cap[6] capObj['digest'] = cap[7] capObj['size'] = cap[8] capObj['bookUUID'] = cap[9] content = unclearContent if unclearContent and not (u' 言情小说_打造最新原创' in unclearContent or unclearContent == 'None'): upload2Bucket(str(cid) + '.json', json.dumps(capObj)) else: try: if not capUrl or len(capUrl) < 1: print cid, 'no url, bookId : ', bookId else: if 'shuqireader' in capUrl: content = getContentByUrl(capUrl) # updateContentById(cid, content) else: content, host = getAndParse(capUrl) if not content: print cid, ' getAndparse content failed, bookId : ', bookId # continue # updateContentById(cid, content) # cap[4] = content capObj['content'] = content upload2Bucket(str(cid) + '.json', json.dumps(capObj)) except Exception as e: print 'cid ', cid, 'error: ', e except ValueError as er: print 'cid ', cid, 'error: ', er