def cleanSubtitle():
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    bookId = 2584584
    carry = 50000
    while bookId < 2590000:
        try:
            dictCsor.execute(
                'select id,subtitle  from ' + db_dushu +
                " where id >= %s and id <= %s and subtitle REGEXP '[0-9]{5,20}'",
                (bookId, bookId + carry))
            conn.commit()

            books = dictCsor.fetchallDict()
            for book in books:
                newSubtitle = subTitleClean(book['subtitle'])
                if newSubtitle != book['subtitle'].encode('utf-8'):
                    myLogging.info('bookId %s update from %s to %s',
                                   book['id'],
                                   book['subtitle'].encode('utf-8'),
                                   newSubtitle)
                    updateOneFieldByOneField('subtitle', newSubtitle, 'id',
                                             book['id'])

        except Exception as e:
            myLogging.warning(e)
        bookId += carry
    dictCsor.close()
    csor.close()
    conn.close()
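# subTitleClean (used above) is defined elsewhere in the project; the sketch below
# is only an assumption of what it does, matching the SQL REGEXP '[0-9]{5,20}' that
# selects the rows to clean: drop long digit runs (QQ/phone numbers pasted into
# blurbs) and collapse the whitespace left behind. Names here are hypothetical.
import re

def subTitleCleanSketch(subtitle):
    # remove runs of 5-20 digits, the same pattern the query filters on
    cleaned = re.sub(u'[0-9]{5,20}', u'', subtitle)
    # collapse repeated whitespace and return utf-8, since cleanSubtitle compares bytes
    return re.sub(u'\s+', u' ', cleaned).strip().encode('utf-8')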
def updateFromMysql(st=10000, end=7000000):
    '''
        Runs forever: pulls novels that are still being serialized from the database and updates them.
    '''

    idx = st
    carry = 10000
    myLogging.info('start from %s to %s ', st, end)

    while idx < end:
        # seq = range(5000, 6000)
        seq = range(idx, idx + carry)

        random.shuffle(seq)
        #
        for sqBid in seq:
            # print sqBid
            # if sqBid in nullIdSet:
            #     continue
            if not srcIdBloom.contains('shuqi' + str(sqBid)):
                try:
                    num = start(sqBid, allowUpdate=False)
                    if num and num > 0:
                        srcIdBloom.add('shuqi' + str(sqBid))
                    # start(17043)
                except Exception as e:
                    # IOError is a subclass of Exception, so one handler covers both
                    myLogging.error('shuqi sid: %s , has exception %s',
                                    str(sqBid), traceback.format_exc())

        idx = idx + carry
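# srcIdBloom above is assumed to be a small wrapper exposing contains()/add() around
# a Bloom filter; the real helper lives elsewhere in the project. A minimal sketch
# using pybloom's BloomFilter (class and parameter names here are hypothetical):
from pybloom import BloomFilter

class SrcIdBloomSketch(object):
    def __init__(self, capacity=10000000, error_rate=0.001):
        self.bf = BloomFilter(capacity=capacity, error_rate=error_rate)

    def contains(self, key):
        # membership test; false positives are possible, false negatives are not
        return key in self.bf

    def add(self, key):
        self.bf.add(key)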
def updateByBookObj(bookObj):
    source = bookObj['source']
    [zid, zBocId] = source.split('/')
    currentChapsObj = getChapsByBocId(zBocId)
    if not currentChapsObj or not currentChapsObj.has_key('chapters') or len(
            currentChapsObj['chapters']) < 1:
        # delBookById(bookObj['id'])
        myLogging.error(
            'zssq  book maybe have been droped, plz consider to delete id: ' +
            str(bookObj['id']) + ' sid: ' + str(source))
        return
    currentChapNum = len(currentChapsObj['chapters'])
    if currentChapNum > bookObj['chapterNum']:

        newIdx = handlChapsByBookObjZidBocId(bookObj,
                                             zid,
                                             currentChapsObj,
                                             allowUpdate=True)

        if newIdx >= bookObj['chapterNum']:  # newIdx is 1-based
            updateOneFieldByOneField('chapterNum', newIdx + 1, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info('zid: %s, bookId: %s  update %s chaps ', zid,
                           bookObj['id'],
                           str(newIdx + 1 - bookObj['chapterNum']))

    else:
        myLogging.info('zid: %s, bookId: %s no update ()', zid, bookObj['id'])
def searchAndCrawl(searchInput, limit=5):

    searchResObj = search(searchInput)
    succcount = 0
    count = 0
    for bookObj in searchResObj['books']:
        count += 1
        if count > 5:  # only look at the first few search results; skip the rest
            break

        digest = getBookDigest(bookObj)
        if bookDigestBloom.contains(digest):
            myLogging.info('has book %s, with same author %s, skip',
                           bookObj['title'].encode('utf-8'),
                           bookObj['author'].encode('utf-8'))
            continue
        zid = bookObj['_id']
        try:
            startByZid(zid, allowUpdate=False)
        except Exception as e:
            myLogging.error('zid %s has exception: %s', zid,
                            traceback.format_exc())
        succcount += 1
        if succcount > limit:  # upper bound on the number of books to crawl
            break
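# getBookDigest (used with bookDigestBloom above) acts as a title+author dedup key,
# judging by the "same author ... skip" log message. A hypothetical sketch, not the
# project's actual helper:
import hashlib

def getBookDigestSketch(bookObj):
    raw = (bookObj['title'] + bookObj['author']).encode('utf-8')
    return hashlib.md5(raw).hexdigest()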
def dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov):
    uidList = pageSoup.select('.list-group-item')
    if len(uidList) < 1:
        myLogging.error('no com list, skip %s page: %s', prov, pageCount)
        return
        # continue
    for uidTag in uidList:
        try:
            if not uidTag.has_attr('href'):
                myLogging.error('no com Tag, skip %s page: %s; tag: %s', prov,
                                pageCount, uidTag)
                # continue
                return
            prv = None
            uid = uidTag['href'].replace('firm_',
                                         '').replace('.shtml',
                                                     '').replace('/', '')
            if '_' in uid:
                strs = uid.split('_')
                prv = strs[0]
                uid = strs[1]
            if uid in idBloom:
                myLogging.info('already crawled, skip uid: %s', uid)
                continue
            insertWithUid(conn, csor, prv, uid)
        except Exception as ee:
            myLogging.error('uid: %s error: %s', uid, ee)
def indexBookSuggest(st=218289):
    myLogging.info('st: %s', st)

    conn2, csor2 = getDushuConnCsor()

    csor2.execute(
        "select id,title,author from cn_dushu_book where id >= %s and operateStatus = 0 ",
        (st, ))
    conn2.commit()
    results = csor2.fetchall()
    baseUrl = DUSHU_SUGGEST_URL
    for book in results:
        id = book[0]
        title = book[1]
        author = book[2]
        # tags = book[3]

        bookObj = dict()
        sinput = []
        sinput.append(title)
        sinput.append(author)
        # if tags:
        #     ts = json.loads(tags)
        #     for t in ts:
        #         sinput.append(t)
        inputBoj = dict()
        inputBoj['input'] = sinput
        inputBoj['output'] = title + "(" + author + ')'
        bookObj['testsuggest'] = inputBoj
        try:
            r = requests.put(baseUrl + str(id), data=json.dumps(bookObj))
            print r.text
        except Exception as e:
            print bookObj, e
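# indexBookSuggest PUTs documents shaped like {"testsuggest": {"input": [...],
# "output": "..."}} to DUSHU_SUGGEST_URL, which implies an Elasticsearch
# completion-suggester mapping (pre-5.x, where "output" was still supported).
# The mapping sketch below is an assumption for illustration, not taken from the project:
import json
import requests

def putSuggestMappingSketch(mappingUrl):
    # mappingUrl is hypothetical, e.g. the index's _mapping endpoint for the book type
    mapping = {"properties": {"testsuggest": {"type": "completion"}}}
    return requests.put(mappingUrl, data=json.dumps(mapping))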
def fixNewLineByBookObjs(quanBenObjs):

    from parse.contentHelper import textClean
    for quanBenObj in quanBenObjs:
        bookId = quanBenObj['id']
        chapIds = getCapIdsByBookId(bookId)
        for chapId in chapIds:
            try:
                url = ossBaseUrl + str(chapId) + '.json'
                r = requests.get(url)

                obj = json.loads(r.text)

                if not obj or not obj.has_key('content'):
                    delCapById(chapId)
                    myLogging.info('chap id %s, has no oss obj, delete',
                                   chapId)
                    continue

                content = textClean(obj['content'])
                obj['content'] = content

                uploadJson2Bucket(str(chapId) + '.json', json.dumps(obj))
                myLogging.info('succ cid %s', chapId)
            except Exception as e:
                myLogging.error('chap id %s, with exception: %s', chapId,
                                traceback.format_exc())
def insertCapWithCapObj(capObj, conn2=None, csor2=None, allowUpdate=False):
    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()

    # sql = "insert ignore cn_dushu_acticle (title,rawUrl,source,content,bookId,idx,digest,size,bookUUID) values" \
    #       "('%s','%s','%s','%s',%d,%d,'%s', %d, '%s')" % (
    #           capObj['title'], capObj['rawUrl'], capObj['source'], capObj['content']
    #           , capObj['bookId'], capObj['idx'], capObj['digest'], capObj['size'], capObj['bookUUID'])
    try:
        csor2.execute("insert cn_dushu_acticle (bookId,idx,digest,bookUUID,title,size) values" \
          "(%s,%s,%s,%s,%s,%s)" , (capObj['bookId'], capObj['idx'], capObj['digest'], capObj['bookUUID'], capObj['title'], capObj['size']))
        # csor2.execute("update cn_dushu_acticle set title = %s, size= %s where digest = %s" , (capObj['title'], capObj['size'], capObj['digest'] ))
        conn2.commit()
        myLogging.info('scap, ' + ":" + str(capObj['idx']))
        # , ', content: ', capObj['content'][0:15]

    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error(ee)
        if not allowUpdate:
            return None
    try:
        csor2.execute(
            "select id,bookId from cn_dushu_acticle where digest = %s;",
            (capObj['digest'], ))
        conn2.commit()

        sqlObj = csor2.fetchone()
        capId = sqlObj[0]
        bookId = sqlObj[1]

        if bookId != capObj['bookId']:
            myLogging.info('update bookId' + str(capId))
            # if the row already exists with a wrong bookId, update it so a bad chapter does not occupy the slot
            csor2.execute(
                "update cn_dushu_acticle set bookId = %s where id = %s;",
                (capObj['bookId'], capId))
            conn2.commit()

        capObj['id'] = capId
        return capId
    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error(ee)
        return None

    csor2.close()
    conn2.close()
def getBookObj(allowUpdate, mid):
    befBookObj = time.time()
    bookObj, count = crawlCurrentBookObj(mid)
    aftBookObj = time.time()

    bookObj = insertBookWithConn(bookObj, allowUpdate)
    # aftInsertBookObj = time.time()
    myLogging.info('crawl book spent' + str(aftBookObj - befBookObj) + ' secs; insert spent ' + str(time.time() - aftBookObj))
    return bookObj, count
def getQichachaInvestDigests():
    idbloom = getBloom()
    conn, csor = getComConnCsor()
    csor.execute('select uid from com_invest')
    ids = csor.fetchall()
    [idbloom.add(mid[0]) for mid in ids]
    # if ids[0][0] in idbloom:
    myLogging.info('load exists ids ok')

    return idbloom
def qichachaFromProvs(provs):
    myLogging.info('start: provs %s', str(provs))
    catBaseUrl = 'http://www.qichacha.com/gongsi_area_prov_'
    conn, csor = getComConnCsor()
    for prov in provs:
        pageBaseUrl = catBaseUrl + prov + '_p_'
        for pageCount in range(1, 501):
            pageUrl = pageBaseUrl + str(pageCount) + '.shtml'
            try:
                pageContent = getQichachaHtml(pageUrl)
                pageSoup = getSoupByStrEncode(pageContent, 'utf-8')
                dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov)
            except Exception as ee:
                myLogging.error('page ' + str(pageCount) + ' error %s', ee)
def dailyLatestUpdate():

    nanShengCategorys = (9, 18, 41, 50, 58, 69, 74, 82, 90, 93, 97, 111)
    nvShengCategorys = (1, 18, 26, 33, 50, 64, 97, 111)

    nanBooks = getLatestUpdateBooks(nanShengCategorys, limit=50)

    updateIdsByBooks(nanBooks, 'girlbest')
    myLogging.info('update boy latest ids to %s', nanBooks)

    nvBooks = getLatestUpdateBooks(nvShengCategorys, limit=50)

    updateIdsByBooks(nvBooks, 'boylastest')
    myLogging.info('update girl latest ids to %s', nvBooks)
def insertWithUid(conn2, csor2, prv, uid):

    if uid in idBloom:
        print 'already crawled uid:', uid
        return

    # idBloom.add(uid)

    if not conn2 or (not csor2):
        conn2, csor2 = getComConnCsor()

    com_base_info_str = getBaseInfoById(prv, uid)
    com_base_info_json = json.loads(com_base_info_str)
    if com_base_info_json['status'] != 1:
        print 'json int not succ , uid: ', uid, ' content:', com_base_info_str
        return
    data = com_base_info_json['data']['Company']
    companyType = data['EconKind']
    # webName = data['webName']
    companyName = data['Name']
    liscense = data['No']
    if not liscense:
        liscense = data['OrgNo']
    examineDate = ''
    if data['CheckDate']:
        examineDate = data['CheckDate'].strip()
        # webSite = ','.join(data['webSite'])
        # sql = """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""" % (str(id), companyName, companyType,examineDate, liscense, "tianyacha",webSite,webName)

    global staticInsertTotolCount, staticInsertTotolTime, staticInsertCarry
    startTime = time.time()

    try:
        csor2.execute(
            """insert ignore into com_base_copy (id,companyName,companyType,examineDate,liscense,source,src_content)
            values (%s,%s,%s,%s,%s,%s,%s);""",
            (uid, companyName, companyType, examineDate, liscense, "qichacha",
             com_base_info_str))
        conn2.commit()
        myLogging.info('comOk, uid: %s, comName: %s', uid,
                       unicode(companyName).encode('utf-8'))
        endTime = time.time()
        thisSpentTime = endTime - startTime

        statisMysqlInsert(staticInsertCarry, thisSpentTime)

    except Exception as e:
        myLogging.error('insert error, uid: %s, error:%s', uid, e)
def insertBookWithConn(bookObj, allowUpdate=True, conn2=None, csor2=None):

    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()

    userId = random.randint(1, 50)

    updateTime = int(time.time())

    digest = getBookDigest(bookObj)
    bookObj['digest'] = digest

    # unified subtitle cleanup
    bookObj['subtitle'] = subTitleClean(bookObj['subtitle'])

    if not bookObj.has_key('source'):
        bookObj['source'] = ''

    try:
        csor2.execute('insert  ' + db_dushu +
          '(categoryCode,typeCode,category,type,userId,title,subtitle,imgUrl,author,updateTime' \
          ",rawUrl,source,digest,status,viewNum, chapterNum, bookType, size) values" \
          "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s)" \
          , (bookObj['categoryCode'],bookObj['typeCode'], bookObj['category'], bookObj['type'], userId,bookObj['title']
             ,bookObj['subtitle'],bookObj['imgUrl'],bookObj['author'],updateTime, bookObj['rawUrl']
             ,bookObj['source'],digest, 11,bookObj['viewNum'],bookObj['chapterNum'],bookObj['bookType'],bookObj['size']))
        # csorDoc.execute('update cn_dushu_book set subtitle = %s where digest = %s'
        #   , (bookObj['subtitle'],digest))
        conn2.commit()
        myLogging.info('succ book, ' +
                       unicode(bookObj['title']).encode('utf-8'))
    except Exception as e:
        # roll back on error; most likely the book already exists
        myLogging.warning('update rollback; maybe exists, err:  %s',
                          traceback.format_exc())
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error('rollback error : ' + bookObj['rawUrl'])

        if u'完结' == bookObj['bookType']:
            updateBookTypeByRawUrl(bookObj['bookType'], bookObj['rawUrl'])
            # return None  # disabled: has a bug
        if not allowUpdate:
            return None
def crawlByDailySearchHistory(timeStart=None):
    myLogging.info('timeStart: %s', timeStart)
    baseUrl = 'http://%s/log/_search' % SEARCHHOST
    if not timeStart:
        timeStart = int(time.time() * 1000) - 24 * 3600 * 1000
    searchInput = '''
    {
"size":0,
"query": {
	"bool":{
		"must":[{
		    "range" : {
		        "page" : {
		            "gte" : 1
		        }
	    	}
	    },
	    {
		    "range" : {
		        "timestamp" : {
		            "gte" : %s
		        }
		    }
		   }
    	]
    }
 },
 "aggs":{
 "hist": {
      "terms": {
        "field": "word.raw",
        "size": 1000,
        "order": {
          "_count": "desc"
        }
      }
    }
 }
 }
    ''' % (str(timeStart))
    r = requests.post(baseUrl, data=searchInput)
    resObj = json.loads(r.text)
    for wordObj in resObj['aggregations']['hist']['buckets']:
        word = wordObj['key']
        searchAndCrawl(word)
def updateContentById(id, content):

    conn, csor = getDushuConnCsor()

    # sql = "update cn_dushu_acticle set content = %s where id = %s " % (content, str(id))
    try:
        csor.execute("update cn_dushu_acticle set content = %s where id = %s ",
                     (content, id))
        conn.commit()
        myLogging.info(str(id) + ' succ cap, ' + content[0:15])
    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn:
            try:
                conn.rollback()
            except Exception as ee:
                myLogging.error(ee)

    csor.close()
    conn.close()
def mianfeiUpdateByBookObj(bookObj, maxChapNum=0):
    mid = bookObj['source']
    newBookObj, newChapNum = crawlCurrentBookObj(mid)
    if not newBookObj:
        myLogging.error(
            'mid %s with dbId %s get None currentBookObj, plz check', mid,
            bookObj['id'])
        return
    latestCapIndex = newBookObj['latestCapIndex']
    newChapNum = max(newChapNum, latestCapIndex, maxChapNum)
    if newChapNum >= bookObj['chapterNum']:
        resIdx = handleCapsByBookObj(allowUpdate=True,
                                     bookObj=bookObj,
                                     count=newChapNum,
                                     mid=mid,
                                     startCapIdx=bookObj['chapterNum'])
        if resIdx > bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', resIdx, 'id', bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info( newBookObj['title'].encode('utf-8') + ' update ' \
                  + str(resIdx - bookObj['chapterNum']) + ' chaps (mianTxt) ')
            if u'连载' != newBookObj['bookType']:
                updateOneFieldByOneField('bookType', newBookObj['bookType'],
                                         'id', bookObj['id'])
                myLogging.info(newBookObj['title'].encode('utf-8') +
                               newBookObj['bookType'])
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') +
                       ' no update (mianTxt)')
def updateByBookObj(bookObj):
    source = int(bookObj['source'].replace('shuqi', ''))
    newBookObj, digest = getBookObjFromSQid(source)
    if not newBookObj:
        # delBookById(bookObj['id'])
        myLogging.error(
            'shuqi book has been droped, plz consider to delete id: ' +
            str(bookObj['id']) + ' sid: ' + str(source))
        return
    if newBookObj['chapterNum'] > bookObj['chapterNum']:
        newBookObj['id'] = bookObj['id']
        newChapNum = crawlCapsWithBookObj(bookObj=newBookObj,
                                          bookId=source,
                                          allowUpdate=True)

        if newChapNum >= bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', newChapNum, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info( newBookObj['title'].encode('utf-8') + ' update ' + str(newChapNum - bookObj['chapterNum'])\
                  + ' chaps ')

            if u'连载' != newBookObj['bookType']:
                updateOneFieldByOneField('bookType', newBookObj['bookType'],
                                         'id', bookObj['id'])
                myLogging.warning(newBookObj['title'].encode('utf-8') +
                                  newBookObj['bookType'].encode('utf-8'))
        else:
            myLogging.info(newBookObj['title'].encode('utf-8') +
                           ' has unexpected state, please check. did not update ')
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') + ' no update ()')
def handleChapsByBookObj(bookObj, zid, allowUpdate=False):

    # zid = bookObj['source']

    bocObjs = getBocObjsByZid(zid)

    sourceCount = 0
    for bocIdx in range(0, len(bocObjs)):
        bocObj = bocObjs[bocIdx]
        bocId = bocObj['_id']

        try:

            bocSource = bocObj['source']
            if 'zhuishuvip' == bocSource:
                continue

            bookObj['source'] = zid + '/' + bocId
            bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(
                zid) + "?source=" + str(bocId)
            chapListObj = getChapsByBocId(bocId)
            bookObj['chapterNum'] = min(bookObj['chapterNum'],
                                        len(chapListObj['chapters']))

            if bookObj['chapterNum'] <= MINCHAPNUM:
                continue

            bookObj = insertBookWithConn(bookObj, allowUpdate)

            resInx = handlChapsByBookObjZidBocId(bookObj, zid, chapListObj,
                                                 allowUpdate)
            if resInx <= MINCHAPNUM:
                myLogging.info(
                    'zid %s dbid %s crawl too small chapNum, delete ', zid,
                    bookObj['id'])
                delBookById(bookObj['id'])

            sourceCount += 1
            if sourceCount >= sourceLimit:
                myLogging.info('zid: %s crawl source to sourceLimit', zid)
                break
            else:
                # bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(zid) + "?source=" + str(bocId)
                # bookObj = parseInsertBook(allowUpdate, bookObj, zid)  # re-insert the book from another source
                myLogging.info('zid: %s crawl another source %s', zid, bocId)
        except Exception as e:
            myLogging.error('zid: %s ,bocId %s get exception ', zid, bocId)
            myLogging.error(traceback.format_exc())
def getQichachaDigests():
    idbloom = loadBloomFromFile('local/qichachaUIDs')
    if idbloom:
        myLogging.info('load bloom from file succ, no need load from db')
        # return idbloom
    else:
        myLogging.info('no dump bloom file,  load from db')
        idbloom = getBloom(2000 * 10000)
        # idbloom = getBloom()
        conn, csor = getComConnCsor()
        csor.execute('select id from com_base_copy')
        # csor.execute('select id from com_base_copy limit 10')
        ids = csor.fetchall()
        [idbloom.add(mid[0]) for mid in ids]
        # if ids[0][0] in idbloom:
        myLogging.info('load exists ids ok, generate dump bloom file')
        dumpBloomToFile(idbloom, fileName='local/qichachaUIDs')
    return idbloom
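# dumpBloomToFile/loadBloomFromFile above persist the Bloom filter between runs so
# the full DB scan can be skipped; a minimal sketch assuming they simply pickle the
# filter to disk (hypothetical names and behaviour):
import os
import pickle

def dumpBloomToFileSketch(bloom, fileName):
    with open(fileName, 'wb') as f:
        pickle.dump(bloom, f)

def loadBloomFromFileSketch(fileName):
    if not os.path.exists(fileName):
        return None
    with open(fileName, 'rb') as f:
        return pickle.load(f)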
    def output(self):
        myLogging.info('mianfeiTXT output')
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''

@author: zyq
'''
import sys

# from app.mianfeiTXTNewFilder import findByIdRange
import time

from app.shuqi import start
from app.shuqiNewFilder import updateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging

if __name__ == '__main__':
    # start('10650', allowUpdate=False)

    st = 50000
    end = 500000
    if len(sys.argv) > 1:
        st = int(sys.argv[1])
        end = int(sys.argv[2])

    updateFromMysql(st, end)
    sleepTime = getHotConfigDict()['shuqiNewFinder']['updateSleep']
    myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
    time.sleep(int(sleepTime))
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''

@author: zyq
'''
import time

from app.mianfeiTXTUpdater import mianfeiTxtUpdateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging

if __name__ == '__main__':
    while 1:
        myLogging.info('begin mianfeiTXT updater')
        mianfeiTxtUpdateFromMysql()
        sleepTime = getHotConfigDict()['mianFeiTXTUpdater']['updateSleep']
        myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
        time.sleep(int(sleepTime))
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''

@author: zyq
'''
import time

from app.zssqUpdater import updateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging
# import logging
# logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',level=logging.INFO)
if __name__ == '__main__':
    while 1:
        myLogging.info('begin zssq updater')
        updateFromMysql()
        sleepTime = getHotConfigDict()['zssqUpdater']['updateSleep']
        myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
        time.sleep(int(sleepTime))
    def output(self):
        myLogging.info('shuqi output')
    def crawl(self):
        myLogging.info('mianfeiTXT crawl')
        handleByMTID(self.mid)
    def crawl(self):
        myLogging.info('shuqi init')

        start(self.sid)
def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx=1):
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in the DB

    # myBookId = bookObj['id']
    #
    # startCap = time.time()
    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:

            if allowUpdate:
                if cid in capIdxs:
                    continue  # this chapter is already in the DB, skip it
                # else:
                #     startCap = time.time()

            befCrawl = time.time()
            succCapTimes = succCapTimes + 1

            # capContentUrl = MianFeiContentBaseUrl + str(cid) + '&contentid=' + str(mid)
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(bookObj['source']).mChapId(
                cid).mianfeiTXTSign().toUrl()

            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                capContent = getContentWithUA(capContentUrl, ua)
            # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                capListJsonObj = json.loads(capContent)
                if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                    resIdx = min(cid, resIdx)
                    myLogging.info('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                    return resIdx  # the upstream API lags behind; bail out here so the later retry-forward pass can pick it up

            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                return resIdx  # the upstream API lags behind; bail out here so the later retry-forward pass can pick it up

            if contentSoup.body.has_attr('style'):
                del contentSoup.body['style']
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '').replace(u'\n\n',
                                                                                                       u'\n').replace(
                u'<br><br>', u'<br>').replace(u'<br/><br/>', u'<br/>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # capObj['size'] = int(WordsCount)
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']

            digest = getCapDigest(bookObj, capObj, cid)

            capObj['digest'] = digest

            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)

            capId = insertCapWithCapObj(capObj)

            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)

            if not capId:
                continue
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))

            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl' + str(mid) + ' cap ' + str(cid) + ' exception: ' + str(e))
            resIdx = min(cid, resIdx)
    if succCapTimes > 1:
        succCapTimes = succCapTimes - 1
    myLogging.info( 'crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) + \
        ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) + \
        ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx
def changeSouceIds():
    bookObjs = getMianAllBookBaseObjs()
    for bookObj in bookObjs:
        try:
            foundNewId = False
            title = bookObj['title']
            author = bookObj['author']
            source = bookObj['source']
            bookId = bookObj['id']

            searchUrl = MianFeiTXTSearchBaseUrl + '?' + paramMap().mianfeiTXT()\
                .put('keyword', (title + author).encode('utf-8'))\
                .put('pageSize', '10').put('pageNum', '1').put('type', '1')\
                .mianfeiTXTSign() \
                .toUrl()

            # time.sleep(random.)
            r = requests.get(searchUrl)

            searchRes = json.loads(r.text)
            for resBook in searchRes['data']['books']:
                resTitle = resBook['name']
                if resTitle != title:
                    continue
                resAuthor = resBook['author']
                if resAuthor != author:
                    continue

                resId = resBook['id']

                if str(resId) == str(source):
                    myLogging.info('WTF: id no change?, bookId: %s, orgSoueceId: %s,  newId: %s', bookId, source, resId)

                latestChapObj = getLatestChapByBookId(bookId)
                if not latestChapObj:
                    myLogging.error('no chaps in db yet, bookId: %s, new mid: %s', bookId, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break

                cid = latestChapObj['idx']
                chapTitle = latestChapObj['title']

                capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(resId).mChapId(
                    cid).mianfeiTXTSign().toUrl()

                capContent = getContentWithUA(capContentUrl)
                if not capContent:
                    capContent = getContentWithUA(capContentUrl)
                # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
                capListJsonObj = json.loads(capContent, strict=False)
                if not (capListJsonObj['returnCode'] == '0000'):
                    capListJsonObj = json.loads(capContent)
                    if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                        myLogging.error('get chap detail fail mid: %s, cid: %s', resId, cid)
                        continue

                chapterName = capListJsonObj['data']['bookChapter']['chapterName']
                if chapterName == chapTitle:
                    myLogging.info('bookId %s change source  from %s to %s', bookId, source, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
            if not foundNewId:
                myLogging.error('bookId %s did not find new id !!!,title: %s, author: %s, org source: %s', bookId, title, author,source )
        except Exception as e:
            myLogging.error(traceback.format_exc())
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''

@author: zyq
'''
import time
import traceback

from app.SearchHistoryCrawler import crawlByDailySearchHistory
from app.shuqiUpdater import updateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging
# import logging
# logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',level=logging.INFO)
if __name__ == '__main__':
    timeStart = int(time.time() * 1000) - 24 * 3600 * 1000
    while 1:
        myLogging.info('begin searchHistoryCrawler')
        timeBeforeSearch = int(time.time() * 1000)
        try:
            crawlByDailySearchHistory(timeStart)
        except Exception as e:
            myLogging.error(traceback.format_exc())
        timeStart = timeBeforeSearch
        sleepTime = getHotConfigDict()['searchHistoryCrawler']['updateSleep']
        myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
        time.sleep(int(sleepTime))