def getLatestUpdateBooks(categorys, limit=30):
    '''
    Fetch the ids of the most recently updated books in the given categories.

    Books whose imgUrl is still the default placeholder cover are excluded.

    :param categorys: sequence of categoryCode values bound to the SQL ``in`` clause
    :param limit: maximum number of rows, ordered by updateTime descending
    :return: rows as dicts holding the ``id`` column; an empty tuple when the
        query fails (the failure is logged as a warning)
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    books = ()
    try:
        dictCsor.execute(
            'select id from ' + db_dushu + " where categoryCode in %s "
            "and imgUrl != 'http://tata-img.oss-cn-shanghai.aliyuncs.com/book-default.jpg' "
            " order by updateTime desc limit %s", (categorys, limit))
        conn.commit()
        # Fetch only after a successful execute; fetching from a cursor whose
        # execute failed raises instead of returning rows.
        books = dictCsor.fetchall()
    except Exception as e:
        myLogging.warning(e)
    finally:
        dictCsor.close()
        csor.close()
        conn.close()
    return books
def getBookByTitle(title):
    '''
    Look up books by exact title among rows whose rawUrl points at the
    yingyangcan ``getbaseinfo.ajax`` endpoint.

    No uniqueness check is made: every matching row is returned.

    :param title: book title to match exactly
    :return: matching rows as dicts (possibly empty), i.e. [bookObj{"id":"1",,}]
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    try:
        # title is bound as a query parameter instead of being concatenated
        # into the SQL text. Because arguments are supplied, the literal LIKE
        # wildcard must be escaped as %%.
        dictCsor.execute(
            "SELECT * from cn_dushu_book where rawUrl like"
            " 'http://api.yingyangcan.com.cn/interface/ajax/book/getbaseinfo.ajax?%%'"
            " and title = %s;", (title, ))
        conn.commit()
        return dictCsor.fetchall()
    finally:
        dictCsor.close()
        csor.close()
        conn.close()
def cleanSubtitle():
    '''
    Maintenance pass that re-cleans book subtitles containing long digit runs.

    Starting at book id 2584584, walks forward in windows of 50000 ids (both
    bounds inclusive) while the window start is below 2590000. Every subtitle
    matching the regexp ``[0-9]{5,20}`` is passed through subTitleClean and
    written back with updateOneFieldByOneField when the result differs.
    A failing window is logged as a warning and skipped.
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    bookId = 2584584
    carry = 50000
    try:
        while bookId < 2590000:
            try:
                dictCsor.execute(
                    'select id,subtitle from ' + db_dushu +
                    " where id >= %s and id <= %s and subtitle REGEXP '[0-9]{5,20}'",
                    (bookId, bookId + carry))
                conn.commit()
                for book in dictCsor.fetchall():
                    newSubtitle = subTitleClean(book['subtitle'])
                    oldSubtitle = book['subtitle'].encode('utf-8')
                    if not newSubtitle == oldSubtitle:
                        myLogging.info('bookId %s update from %s to %s',
                                       book['id'], oldSubtitle, newSubtitle)
                        updateOneFieldByOneField('subtitle', newSubtitle,
                                                 'id', book['id'])
            except Exception as e:
                myLogging.warning(e)
            bookId += carry
    finally:
        dictCsor.close()
        csor.close()
        conn.close()
def updateCapDigest(): conn2, csor2 = getDushuConnCsor() for i in range(1056363, 1722907): try: capObj = json.loads(bucket.get_object(str(i) + '.json').read()) except Exception as e: print i, e continue # for cap in caps: cid = capObj['id'] print cid bookDigest = capObj['bookUUID'] capTitle = capObj['title'] idx = capObj['idx'] m2 = hashlib.md5() forDigest = bookDigest + capTitle + u'#' + str(idx) m2.update(forDigest.encode('utf-8')) digest2 = m2.hexdigest() try: csor2.execute( "update cn_dushu_acticle set digest = %s where id = %s", (digest2, cid)) conn2.commit() except Exception as e: print cid, e csor2.close() conn2.close()
def indexBookSuggest(st=218289): myLogging.info('st: %s', st) conn2, csor2 = getDushuConnCsor() csor2.execute( "select id,title,author from cn_dushu_book where id >= %s and operateStatus = 0 ", (st, )) conn2.commit() results = csor2.fetchall() baseUrl = DUSHU_SUGGEST_URL for book in results: id = book[0] title = book[1] author = book[2] # tags = book[3] bookObj = dict() sinput = [] sinput.append(title) sinput.append(author) # if tags: # ts = json.loads(tags) # for t in ts: # sinput.append(t) inputBoj = dict() inputBoj['input'] = sinput inputBoj['output'] = title + "(" + author + ')' bookObj['testsuggest'] = inputBoj try: r = requests.put(baseUrl + str(id), data=json.dumps(bookObj)) print r.text except Exception as e: print bookObj, e
def _rollbackCapInsert(conn2):
    '''Roll back the current transaction, logging (not raising) a rollback failure.'''
    if conn2:
        try:
            conn2.rollback()
        except Exception as ee:
            myLogging.error(ee)


def insertCapWithCapObj(capObj, conn2=None, csor2=None, allowUpdate=False):
    '''
    Insert a chapter row into cn_dushu_acticle and return its database id.

    After the insert attempt the row is looked up again by ``digest``. When the
    stored bookId differs from ``capObj['bookId']`` it is overwritten, so a
    wrong chapter cannot keep occupying that digest. On success
    ``capObj['id']`` is set to the row id.

    :param capObj: dict with keys bookId, idx, digest, bookUUID, title, size
    :param conn2: optional open connection; a new one is obtained when either
        conn2 or csor2 is missing
    :param csor2: optional cursor belonging to conn2
    :param allowUpdate: when False, a failed insert returns None immediately;
        when True, the existing row is looked up (and fixed) instead
    :return: the chapter row id, or None when the insert failed and
        allowUpdate is False, or when the lookup failed
    '''
    ownsConn = not conn2 or not csor2
    if ownsConn:
        conn2, csor2 = getDushuConnCsor()
    try:
        try:
            csor2.execute(
                "insert cn_dushu_acticle (bookId,idx,digest,bookUUID,title,size) values"
                "(%s,%s,%s,%s,%s,%s)",
                (capObj['bookId'], capObj['idx'], capObj['digest'],
                 capObj['bookUUID'], capObj['title'], capObj['size']))
            conn2.commit()
            myLogging.info('scap, ' + ":" + str(capObj['idx']))
        except Exception as e:
            # Roll back on error.
            myLogging.error(e)
            _rollbackCapInsert(conn2)
            if not allowUpdate:
                return None
        try:
            csor2.execute(
                "select id,bookId from cn_dushu_acticle where digest = %s;",
                (capObj['digest'], ))
            conn2.commit()
            sqlObj = csor2.fetchone()
            capId = sqlObj[0]
            bookId = sqlObj[1]
            if bookId != capObj['bookId']:
                myLogging.info('update bookId' + str(capId))
                # The row already exists with a wrong bookId: fix it so the
                # wrong chapter does not keep occupying this digest.
                csor2.execute(
                    "update cn_dushu_acticle set bookId = %s where id = %s;",
                    (capObj['bookId'], capId))
                conn2.commit()
            capObj['id'] = capId
            return capId
        except Exception as e:
            # Roll back on error.
            myLogging.error(e)
            _rollbackCapInsert(conn2)
            return None
    finally:
        # Only release what this call opened; a caller-supplied connection
        # stays open for the caller to reuse.
        if ownsConn:
            csor2.close()
            conn2.close()
def loadDid(self):
    '''
    Add the digest of every book with operateStatus = 0 to ``self.ids``.
    '''
    connection, cursor = getDushuConnCsor()
    cursor.execute(
        "select digest from cn_dushu_book where operateStatus = 0;")
    connection.commit()
    rows = cursor.fetchall()
    for row in rows:
        self.ids.add(row[0])
    cursor.close()
    connection.close()
def getAll():
    '''
    Return the 100 newest rows of cn_test (highest id first).

    Each row is a tuple: updateTime formatted as ``%Y-%m-%d`` followed by the
    keshou_*, new_publish_* and sign_* count/area columns listed in the query.

    :return: the fetched rows
    '''
    conn, csor = getDushuConnCsor()
    try:
        # No argument tuple is passed here, so the % signs of DATE_FORMAT are
        # sent to the server verbatim; adding parameters would require
        # escaping them as %%.
        csor.execute(
            "select DATE_FORMAT(updateTime, '%Y-%m-%d') as day, keshou_count, keshou_area, keshou_zhuzai_count"
            ", keshou_zhuzai_area, new_publish_count, new_publish_area, new_publish_zhuzai_count"
            ", new_publish_zhuzai_area, sign_count, sign_area, sign_zhuzai_count, sign_zhuzai_area from cn_test "
            "order by id desc limit 100")
        conn.commit()
        return csor.fetchall()
    finally:
        csor.close()
        conn.close()
def fixUnuploadedCaps():
    '''
    Remove chapter rows whose JSON object is missing from OSS, for books that
    have more chapter rows than their recorded chapterNum.

    For each book returned by getShuqiAllBookObjs() the chapter rows are
    walked from the highest id downwards. A HEAD request is made to
    ``ossBaseUrl + <chapter id> + '.json'``; a status code above 200 causes
    the row to be deleted via delCapById. If anything was deleted and fewer
    rows than chapterNum remain, chapterNum is lowered to the remaining count
    and saved with updateByBookObj.
    '''
    bookObjs = getShuqiAllBookObjs()
    conn, csor = getDushuConnCsor()
    for bookObj in bookObjs:
        csor.execute(
            'select count(*) from ' + db_acticle + ' where bookId = %s ',
            (bookObj['id'], ))
        conn.commit()
        db_cap_count = csor.fetchone()[0]
        # Only books with more rows than the recorded chapter count are examined.
        if db_cap_count <= bookObj['chapterNum']:
            continue
        csor.execute(
            'select id from ' + db_acticle + ' where bookId = %s order by id desc',
            (bookObj['id'], ))
        conn.commit()
        cids = csor.fetchall()
        deleteCount = 0
        for cidL in cids:
            cid = cidL[0]
            ossUrl = ossBaseUrl + str(cid) + '.json'
            r = requests.head(ossUrl)
            if r.status_code > 200:
                print 'bookId' + str(bookObj['id']) + ' cid: ' + str(
                    cid) + ' status_code: ' + str(r.status_code)
                # remove it from the chapter table
                delCapById(cid)
                deleteCount = deleteCount + 1
            else:
                # This chapter exists in OSS: stop scanning this book as soon
                # as the remaining row count is at least chapterNum.
                nowCapCount = len(cids) - deleteCount
                if bookObj['chapterNum'] <= nowCapCount:
                    break
        if deleteCount > 0:  # something was deleted
            nowCapCount = len(cids) - deleteCount
            if bookObj['chapterNum'] == nowCapCount:
                continue  # exactly equal covers two cases: 1) finished book, should be fine, ignored for now; 2) serialized book, left to the scheduled updater
            if bookObj['chapterNum'] < nowCapCount:
                # still more chapters than expected after deleting: print a log line, update
                print 'still more chapters, check bookId: ', str(bookObj['id'])
            # fewer chapters than expected after deleting: update
            elif bookObj['chapterNum'] > nowCapCount:
                bookObj['chapterNum'] = nowCapCount
                # update
                updateByBookObj(bookObj)
    csor.close()
    conn.close()
def getCapObjsById(bookId):
    '''
    List the chapters of one book, limited to chapter ids below 63017738.

    :param bookId: id of the book whose chapters are wanted
    :return: rows as dicts with the columns id, title and idx
    '''
    connection, plainCursor = getDushuConnCsor()
    rowCursor = connection.cursor(MySQLdb.cursors.DictCursor)
    query = ("SELECT id,title,idx from " + db_acticle
             + " where bookId = %s and id < 63017738;")
    rowCursor.execute(query, (bookId, ))
    connection.commit()
    chapters = rowCursor.fetchallDict()
    plainCursor.close()
    connection.close()
    return chapters
def deleteNLastChaps(dbBookId, limit):
    '''
    Delete the newest N chapters (highest ids) of a book.

    :param dbBookId: id of the book whose chapters are removed
    :param limit: number of chapter rows to delete
    :return: None
    '''
    connection, cursor = getDushuConnCsor()
    statement = ('delete from ' + db_acticle
                 + " where bookId = %s order by id desc limit %s;")
    cursor.execute(statement, (dbBookId, limit))
    connection.commit()
    cursor.close()
    connection.close()
def updateCapFromTo(f, t): conn2, csor2 = getDushuConnCsor() print 'from', str(f), ' to ', str(t) offset = 100 begin = f end = begin + offset while end <= t: # sql = "select id, rawUrl,bookId,content from cn_dushu_acticle where id >= %d and id < %d" % (begin, end) try: csor2.execute( "select id, rawUrl,bookId,content from cn_dushu_acticle where id >= %d and id < %d", (begin, end)) conn2.commit() except Exception as e: # # 发生错误时回滚 print 'mysql ex: ', e begin = begin + offset end = end + offset results = csor2.fetchall() for cap in results: cid = cap[0] capUrl = cap[1] bookId = cap[2] unclearContent = cap[3] if not (u' 言情小说_打造最新原创' in unclearContent or unclearContent == 'None'): continue try: if not capUrl or len(capUrl) < 1: print 'no url, bookId : ', bookId if 'shuqireader' in capUrl: content = getContentByUrl(capUrl) # updateContentById(cid, content) else: content, host = getAndParse(capUrl) if not content: continue updateContentById(cid, content) except Exception as e: print 'cid ', cid, 'error: ', e except ValueError as er: print 'cid ', cid, 'error: ', er csor2.close() conn2.close()
def shuqiAddInit(): global gBookDict gBookDict = loadExistsSQId() conn2,csor2 = getDushuConnCsor() global donedegest donedegest = loadBloomFromFile(bloomDumpCapsName) if donedegest: print 'load bloom from file succ, no need load from db' return else: print 'load from db' donedegest = getBloom(2000 * 10000) csor2.execute("select id from cn_dushu_acticle order by id desc limit 1") conn2.commit() length = csor2.fetchone()[0] step = 0 carry = 500000 # while step < length - 1500000: while step < length : csor2.execute("select digest from cn_dushu_acticle where id > %s and id < %s", (step, step + carry)) conn2.commit() step = step + carry caps = csor2.fetchall() for cap in caps: digest = cap[0] # bookDigest = cap[1] # beg = time.time() # if not bookDigest in bookDict.keys(): # dictTook = time.time() # print 'dict took: ', dictTook - beg # continue # dictTook = time.time() # print 'dict took: ',dictTook - beg donedegest.add(digest) # blTook = time.time() # print 'bl took: ',blTook - dictTook # global gBookDict # gBookDict = bookDict dumpBloomToFile(donedegest, bloomDumpCapsName) csor2.close() conn2.close() return donedegest
def getIdsByType(confType):
    '''
    Read the ``ids`` column of the row in the type-book table with the given type.

    :param confType: value matched against the ``type`` column
    :return: the stored ids value, or None when no row matches or the query
        fails (the failure is logged as a warning)
    '''
    conn, csor = getDushuConnCsor()
    ids = None
    try:
        csor.execute("select ids from " + db_typeBook + " where type = %s",
                     (confType, ))
        conn.commit()
        row = csor.fetchone()
        if row:
            ids = row[0]
    except Exception as e:
        myLogging.warning('get bookType exception: ' + str(e))
    finally:
        csor.close()
        conn.close()
    return ids
def updateOneFieldByOneField(upFieldName, upFieldValue, byFieldName,
                             byFieldValue):
    '''
    Set one column of the book table for the rows selected by another column,
    and stamp updateTime with the current unix time.

    Both column names are concatenated into the SQL text, so they must be
    trusted identifiers; the two values are bound as parameters.

    :param upFieldName: column to update
    :param upFieldValue: new value for that column
    :param byFieldName: column used in the where clause
    :param byFieldValue: value the where column must equal
    '''
    connection, cursor = getDushuConnCsor()
    try:
        statement = ("update " + db_dushu + " set " + upFieldName
                     + " = %s, updateTime = " + str(int(time.time()))
                     + " where " + byFieldName + " = %s")
        cursor.execute(statement, (upFieldValue, byFieldValue))
        connection.commit()
    except Exception as err:
        myLogging.warning('update bookType exception: ' + str(err))
    cursor.close()
    connection.close()
def shuqiAddInitTmp():
    '''
    Add the rawUrl of every row in cn_dushu_book to the global ``donedegest``.
    '''
    global donedegest
    connection, cursor = getDushuConnCsor()
    cursor.execute("select rawUrl from cn_dushu_book")
    connection.commit()
    rows = cursor.fetchall()
    for row in rows:
        donedegest.add(row[0])
    cursor.close()
    connection.close()
def updateBookTypeByRawUrl(type, rawUrl):
    '''
    Set bookType on the book rows with the given rawUrl.

    A failing update is logged as a warning instead of being propagated.

    :param type: new bookType value
    :param rawUrl: rawUrl identifying the book
    '''
    connection, cursor = getDushuConnCsor()
    try:
        statement = "update " + db_dushu + " set bookType = %s where rawUrl = %s"
        cursor.execute(statement, (type, rawUrl))
        connection.commit()
    except Exception as err:
        myLogging.warning('update bookType exception: ' + str(err))
    cursor.close()
    connection.close()
def getLatestChapByBookId(bookId):
    '''
    Fetch the chapter row with the highest id for the given book.

    :param bookId: id of the book
    :return: the chapter row as a dict, or None when the book has no chapters
        or the query fails (the failure is logged as a warning)
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    chapObj = None
    try:
        dictCsor.execute(
            "select * from " + db_acticle +
            " where bookId = %s order by id desc limit 1;", (bookId, ))
        conn.commit()
        # Fetch only after a successful execute.
        chapObj = dictCsor.fetchone()
    except Exception as e:
        myLogging.warning('getLatestChapByBookId exception: ' + str(e))
    finally:
        dictCsor.close()
        csor.close()
        conn.close()
    return chapObj
def getChapTitlesByBookId(bookId):
    '''
    Collect the distinct chapter titles of a book.

    :param bookId: id of the book
    :return: set of title values
    '''
    connection, cursor = getDushuConnCsor()
    query = 'select title from ' + db_acticle + " where bookId = %s"
    cursor.execute(query, (bookId, ))
    connection.commit()
    titles = set(row[0] for row in cursor.fetchall())
    cursor.close()
    connection.close()
    return titles
def loadDid(self):
    '''
    Fill ``self.ids`` with the source of every book in cn_dushu_book plus
    ``'shuqi' + sid`` for every row of shuqi_deleted_ids.
    '''
    connection, cursor = getDushuConnCsor()

    cursor.execute("select source from cn_dushu_book;")
    connection.commit()
    for row in cursor.fetchall():
        self.ids.add(row[0])

    cursor.execute("select sid from shuqi_deleted_ids;")
    connection.commit()
    for row in cursor.fetchall():
        self.ids.add('shuqi' + str(row[0]))

    cursor.close()
    connection.close()
def getFieldByBookId(field, bookId):
    '''
    Collect the distinct values of one chapter column for a book.

    :param field: column name; it is concatenated into the SQL text, so it
        must be a trusted identifier
    :param bookId: id of the book
    :return: set of the values found in that column
    '''
    connection, cursor = getDushuConnCsor()
    query = 'select ' + field + ' from ' + db_acticle + " where bookId = %s"
    cursor.execute(query, (bookId, ))
    connection.commit()
    values = set(row[0] for row in cursor.fetchall())
    cursor.close()
    connection.close()
    return values
def insertBookWithConn(bookObj, allowUpdate=True, conn2=None, csor2=None):
    '''
    Insert a book row into the book table.

    Before inserting, ``bookObj`` is updated in place: ``digest`` is set from
    getBookDigest, ``subtitle`` is passed through subTitleClean and a missing
    ``source`` defaults to ''. The row gets a random userId between 1 and 50,
    the current unix time as updateTime and status 11.

    When the insert fails (for example because the book already exists) the
    transaction is rolled back, and a book whose bookType is u'完结' has its
    stored bookType refreshed through updateBookTypeByRawUrl.

    :param bookObj: dict describing the book (categoryCode, typeCode,
        category, type, title, subtitle, imgUrl, author, rawUrl, viewNum,
        chapterNum, bookType, size; optionally source)
    :param allowUpdate: checked only after a failed insert
    :param conn2: optional open connection; a new one is obtained when either
        conn2 or csor2 is missing
    :param csor2: optional cursor belonging to conn2
    :return: None
    '''
    ownsConn = not conn2 or not csor2
    if ownsConn:
        conn2, csor2 = getDushuConnCsor()
    try:
        userId = random.randint(1, 50)
        updateTime = int(time.time())
        digest = getBookDigest(bookObj)
        bookObj['digest'] = digest
        # unified clean-up step
        bookObj['subtitle'] = subTitleClean(bookObj['subtitle'])
        if 'source' not in bookObj:
            bookObj['source'] = ''
        try:
            csor2.execute(
                'insert ' + db_dushu +
                '(categoryCode,typeCode,category,type,userId,title,subtitle,imgUrl,author,updateTime'
                ",rawUrl,source,digest,status,viewNum, chapterNum, bookType, size) values"
                "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s)",
                (bookObj['categoryCode'], bookObj['typeCode'],
                 bookObj['category'], bookObj['type'], userId,
                 bookObj['title'], bookObj['subtitle'], bookObj['imgUrl'],
                 bookObj['author'], updateTime, bookObj['rawUrl'],
                 bookObj['source'], digest, 11, bookObj['viewNum'],
                 bookObj['chapterNum'], bookObj['bookType'], bookObj['size']))
            conn2.commit()
            myLogging.info('succ book, ' + unicode(bookObj['title']).encode('utf-8'))
        except Exception:
            # Roll back on error.
            myLogging.warning('update rollback; maybe exists, err: %s',
                              traceback.format_exc())
            if conn2:
                try:
                    conn2.rollback()
                except Exception:
                    myLogging.error('rollback error : ' + bookObj['rawUrl'])
            if u'完结' == bookObj['bookType']:
                updateBookTypeByRawUrl(bookObj['bookType'], bookObj['rawUrl'])
            # NOTE: an unconditional ``return None`` at this point was disabled
            # by the original author and marked as buggy.
            if not allowUpdate:
                return None
    finally:
        # Only release what this call opened; a caller-supplied connection
        # stays open for the caller to reuse.
        if ownsConn:
            csor2.close()
            conn2.close()
def getBookCount():
    '''
    Get the total number of books in the book table.

    :return: the row count, or None when the query fails (the failure is
        logged as a warning)
    '''
    conn, csor = getDushuConnCsor()
    count = None
    try:
        csor.execute("select count(*) from " + db_dushu)
        conn.commit()
        # Fetch only after a successful execute.
        count = csor.fetchone()[0]
    except Exception as e:
        myLogging.warning('update bookType exception: ' + str(e))
    finally:
        csor.close()
        conn.close()
    return count
def delCapById(cid):
    '''
    Delete one chapter row by its id.

    A failing delete is logged as an error and rolled back; a failing
    rollback is logged as well.

    :param cid: id of the chapter row to delete
    '''
    connection, cursor = getDushuConnCsor()
    try:
        statement = "delete from " + db_acticle + " where id = %s"
        cursor.execute(statement, (cid, ))
        connection.commit()
    except Exception as err:
        # Roll back on error.
        myLogging.error('mysql ex: ' + str(err))
        if connection:
            try:
                connection.rollback()
            except Exception:
                myLogging.error('rollback error : ' + str(cid))
    cursor.close()
    connection.close()
def loadExistsSQId():
    '''
    Build a bloom filter of the ``source`` values of existing shuqi books.

    Reads the source of every book whose source starts with 'shuqi' and whose
    id is below 127400, adds each value to a filter created with
    getBloom(200000) and dumps the filter to 'local/BooksBloomDump'.

    :return: the filled bloom filter
    '''
    conn2, csor2 = getDushuConnCsor()
    bloom = getBloom(200000)
    try:
        csor2.execute(
            "select source from cn_dushu_book where source like 'shuqi%' and id < 127400;"
        )
        conn2.commit()
        for row in csor2.fetchall():
            # Each row is a 1-tuple: add the source string itself, not the
            # tuple, so membership tests with a plain source value can match.
            bloom.add(row[0])
    finally:
        csor2.close()
        conn2.close()
    dumpBloomToFile(bloom, 'local/BooksBloomDump')
    return bloom
def getZssqAllBookObjs():
    '''
    Fetch the primary keys of every book whose rawUrl starts with
    'http://api.zhuishushenqi.com/book/'.

    :return: rows as dicts holding only the ``id`` column, i.e.
        [bookObj{"id":"1"}]
    '''
    connection, plainCursor = getDushuConnCsor()
    rowCursor = connection.cursor(MySQLdb.cursors.DictCursor)
    query = ("SELECT id from cn_dushu_book where "
             " rawUrl like 'http://api.zhuishushenqi.com/book/%';")
    rowCursor.execute(query)
    connection.commit()
    books = rowCursor.fetchallDict()
    plainCursor.close()
    connection.close()
    return books
def getShuqiIdRawUrlAsBookObjs():
    '''
    Fetch id and rawUrl of every book with operateStatus = 0 whose rawUrl
    starts with 'http://api.shuqireader.com/reader/bc_cover.php'.

    :return: rows as dicts with the columns id and rawUrl, i.e.
        [bookObj{"id":"1",,}]
    '''
    connection, plainCursor = getDushuConnCsor()
    rowCursor = connection.cursor(MySQLdb.cursors.DictCursor)
    query = ("SELECT id,rawUrl from cn_dushu_book where operateStatus = 0 "
             " and rawUrl like 'http://api.shuqireader.com/reader/bc_cover.php%';")
    rowCursor.execute(query)
    connection.commit()
    books = rowCursor.fetchallDict()
    plainCursor.close()
    connection.close()
    return books
def updateIdsByType(confType, ids):
    '''
    Store ``ids`` in the type-book row with the given type.

    A failing update is logged as a warning instead of being propagated.

    :param confType: value matched against the ``type`` column
    :param ids: new value for the ``ids`` column
    '''
    connection, cursor = getDushuConnCsor()
    try:
        statement = "update " + db_typeBook + ' set ids = %s where type = %s'
        cursor.execute(statement, (ids, confType))
        connection.commit()
    except Exception as err:
        myLogging.warning('update bookType exception: ' + str(err))
    cursor.close()
    connection.close()
    # return ids


# if __name__ == '__main__':
#     delBookById(227921)
def getBookObjById(dbid):
    '''
    Fetch a book row by its primary key.

    :param dbid: primary key (id) of the book
    :return: the book row as a dict, or None when no such row exists or the
        query fails (the failure is logged as a warning)
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    bookObj = None
    try:
        dictCsor.execute("select * from " + db_dushu + " where id = %s",
                         (dbid, ))
        conn.commit()
        # Fetch only after a successful execute.
        bookObj = dictCsor.fetchone()
    except Exception as e:
        myLogging.warning('update bookType exception: ' + str(e))
    finally:
        dictCsor.close()
        csor.close()
        conn.close()
    return bookObj
def deleteChapsLargerThanIdx(bookId, idx):
    '''
    Delete every chapter of a book whose idx is greater than the given idx.

    A failing delete is logged as a warning instead of being propagated.

    :param bookId: id of the book
    :param idx: chapters with an idx above this value are removed
    :return: None
    '''
    connection, cursor = getDushuConnCsor()
    try:
        statement = ('delete from ' + db_acticle
                     + " where bookId = %s and idx > %s")
        cursor.execute(statement, (bookId, idx))
        connection.commit()
    except Exception as err:
        myLogging.warning(err)
    cursor.close()
    connection.close()