Example #1
0
def updateByBookObj(bookObj):
    '''
    Refresh one shuqi-sourced book row from the remote site.

    The numeric shuqi id is taken from ``bookObj['source']`` (the 'shuqi'
    text is stripped). When the remote book reports more chapters than the
    stored row, the chapters are crawled and chapterNum / typeBoost (and
    bookType, when it is no longer u'连载') are updated.

    :param bookObj: stored book row; reads 'source', 'id' and 'chapterNum'
    :return: None
    '''
    shuqiId = int(bookObj['source'].replace('shuqi', ''))
    freshBook, _digest = getBookObjFromSQid(shuqiId)

    # Remote book is gone: only report it, deletion is left to the operator.
    if not freshBook:
        myLogging.error(
            'shuqi book has been droped, plz consider to delete id: ' +
            str(bookObj['id']) + ' sid: ' + str(shuqiId))
        return

    # Remote chapter count did not grow: nothing to crawl.
    if not (freshBook['chapterNum'] > bookObj['chapterNum']):
        myLogging.info(freshBook['title'].encode('utf-8') + ' no update ()')
        return

    # Crawl under the existing primary key so chapters attach to this row.
    freshBook['id'] = bookObj['id']
    crawledNum = crawlCapsWithBookObj(bookObj=freshBook,
                                      bookId=shuqiId,
                                      allowUpdate=True)

    if not (crawledNum >= bookObj['chapterNum']):
        myLogging.info(freshBook['title'].encode('utf-8') +
                       ' has unexcepted, please check. didnot update ')
        return

    updateOneFieldByOneField('chapterNum', crawledNum, 'id', bookObj['id'])
    updateBoostWithUpdateTime(bookObj['id'])
    myLogging.info(
        freshBook['title'].encode('utf-8') + ' update ' +
        str(crawledNum - bookObj['chapterNum']) + ' chaps ')

    # Persist the book type whenever it is anything other than "serializing".
    if u'连载' != freshBook['bookType']:
        updateOneFieldByOneField('bookType', freshBook['bookType'], 'id',
                                 bookObj['id'])
        myLogging.warning(freshBook['title'].encode('utf-8') +
                          freshBook['bookType'].encode('utf-8'))
Example #2
0
def cleanSubtitle(startId=2584584, endId=2590000, carry=50000):
    '''
    Re-run subTitleClean over stored subtitles that contain a run of 5-20 digits.

    The book table is scanned in id windows ``[bookId, bookId + carry]``
    (both ends inclusive, as in the SQL below) starting at ``startId`` while
    ``bookId < endId``; the last window may therefore reach past ``endId``.
    A failure inside one window is logged and the scan moves on to the next.

    :param startId: first book id to scan
    :param endId: scanning stops once the window start reaches this id
    :param carry: window width in ids; must be positive
    :return: None
    '''
    if carry <= 0:
        # A non-positive step would never advance the window.
        raise ValueError('carry must be positive')

    conn, csor = getDushuConnCsor()
    dictCsor = None
    try:
        dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
        bookId = startId
        while bookId < endId:
            try:
                dictCsor.execute(
                    'select id,subtitle  from ' + db_dushu +
                    " where id >= %s and id <= %s and subtitle REGEXP '[0-9]{5,20}'",
                    (bookId, bookId + carry))
                conn.commit()

                # DictCursor.fetchall() already yields dict rows.
                for book in dictCsor.fetchall():
                    newSubtitle = subTitleClean(book['subtitle'])
                    oldSubtitle = book['subtitle'].encode('utf-8')
                    if newSubtitle != oldSubtitle:
                        myLogging.info('bookId %s update from %s to %s',
                                       book['id'], oldSubtitle, newSubtitle)
                        updateOneFieldByOneField('subtitle', newSubtitle,
                                                 'id', book['id'])
            except Exception as e:
                myLogging.warning(e)
            bookId += carry
    finally:
        # Always release both cursors and the connection.
        if dictCsor is not None:
            dictCsor.close()
        csor.close()
        conn.close()
def getLatestUpdateBooks(categorys, limit=30):
    '''
    Return the ids of the most recently updated books in the given categories.

    Rows whose imgUrl is still the default cover are excluded; the result is
    ordered by updateTime, newest first.

    :param categorys: categoryCode values bound to the SQL ``in`` clause
    :param limit: maximum number of rows to return
    :return: the fetched rows (dict rows holding ``id``); an empty tuple when
             the query fails
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = None
    try:
        dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
        dictCsor.execute(
            'select id  from ' + db_dushu + " where categoryCode in %s "
            "and imgUrl != 'http://tata-img.oss-cn-shanghai.aliyuncs.com/book-default.jpg' "
            " order by updateTime desc limit %s", (categorys, limit))
        conn.commit()
        return dictCsor.fetchall()
    except Exception as e:
        # Fetching after a failed execute would raise; report and return empty.
        myLogging.warning(e)
        return ()
    finally:
        if dictCsor is not None:
            dictCsor.close()
        csor.close()
        conn.close()
def updateOneFieldByOneField(upFieldName, upFieldValue, byFieldName,
                             byFieldValue):
    '''
    Set one column of the book table for the rows matched by another column.

    updateTime is set to the current epoch second in the same statement.
    Column names are concatenated into the SQL text (identifiers cannot be
    bound as parameters), so they must come from trusted code, never from
    external input. Failures are logged, not raised.

    :param upFieldName: name of the column to set
    :param upFieldValue: new value for that column (bound parameter)
    :param byFieldName: name of the column used in the where clause
    :param byFieldValue: value the where column must equal (bound parameter)
    :return: None
    '''
    conn, csor = getDushuConnCsor()
    try:
        csor.execute(
            "update " + db_dushu + " set " + upFieldName +
            "  = %s, updateTime = %s where " + byFieldName + " = %s",
            (upFieldValue, int(time.time()), byFieldValue))
        conn.commit()
    except Exception as e:
        # Name the actual columns so the log points at the failing update.
        myLogging.warning('update ' + str(upFieldName) + ' by ' +
                          str(byFieldName) + ' exception: ' + str(e))
    finally:
        csor.close()
        conn.close()
def updateBookTypeByRawUrl(type, rawUrl):
    '''
    Set bookType on the book row(s) whose rawUrl equals the given url.

    Errors are logged as warnings and not raised.

    :param type: new bookType value
    :param rawUrl: rawUrl identifying the row(s) to update
    :return: None
    '''
    conn, csor = getDushuConnCsor()
    try:
        statement = "".join(
            ["update ", db_dushu, " set bookType = %s where rawUrl = %s"])
        boundValues = (type, rawUrl)
        csor.execute(statement, boundValues)
        conn.commit()
    except Exception as err:
        myLogging.warning('update bookType exception: ' + str(err))

    csor.close()
    conn.close()
Example #6
0
def getIdsByType(confType):
    '''
    Return the ``ids`` column of the type-book row with the given type.

    :param confType: value matched against the ``type`` column
    :return: the stored ids value, or None when no row matches or the query
             fails (both cases are logged)
    '''
    conn, csor = getDushuConnCsor()
    try:
        csor.execute("select ids from " + db_typeBook + " where type = %s",
                     (confType, ))
        conn.commit()
        row = csor.fetchone()
        if row is None:
            # No such type configured: avoid indexing into None.
            myLogging.warning('no ids row for type: %s', confType)
            return None
        return row[0]
    except Exception as e:
        myLogging.warning('get bookType exception: ' + str(e))
        return None
    finally:
        csor.close()
        conn.close()
def getLatestChapByBookId(bookId):
    '''
    Return the chapter row with the highest id for the given book.

    :param bookId: value matched against the chapter table's bookId column
    :return: the row as a dict, or None when the book has no chapters or the
             query fails (failure is logged)
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = None
    try:
        dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
        dictCsor.execute(
            "select * from " + db_acticle +
            " where bookId = %s order by id desc limit 1;", (bookId, ))
        conn.commit()
        return dictCsor.fetchone()
    except Exception as e:
        # Fetching after a failed execute would raise; report and return None.
        myLogging.warning('getLatestChapByBookId exception: ' + str(e))
        return None
    finally:
        if dictCsor is not None:
            dictCsor.close()
        csor.close()
        conn.close()
Example #8
0
def fromInvestInt():
    '''
    Start the invest crawl from rows selected out of com_base_copy.

    Uses the module-level conn/csor, creating them through getComConnCsor()
    when either is unset, then calls getInvestListByNameId for every row that
    has a company name.
    '''
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()

    seedQuery = "select id,companyName from com_base_copy where id = '6bc7e7ccdb755391651316a0227c059b' and companyName is not Null  limit 10;"
    csor.execute(seedQuery)

    # Rows are (id, companyName) pairs; all are fetched before crawling starts.
    for uid, cName in csor.fetchall():
        if cName:
            getInvestListByNameId(uid, cName)
        else:
            myLogging.warning('no comName skip, uid: %s', uid)
def getBookCount():
    '''
    Return the total number of rows in the book table.

    :return: the ``count(*)`` value, or None when the query fails (the
             failure is logged)
    '''
    conn, csor = getDushuConnCsor()
    try:
        csor.execute("select count(*) from " + db_dushu)
        conn.commit()
        return csor.fetchone()[0]
    except Exception as e:
        # Fetching after a failed execute would raise; report and return None.
        myLogging.warning('getBookCount exception: ' + str(e))
        return None
    finally:
        csor.close()
        conn.close()
def insertBookWithConn(bookObj, allowUpdate=True, conn2=None, csor2=None):
    '''
    Insert a book row, reusing a caller-supplied connection when one is given.

    Before inserting, ``bookObj`` is mutated in place: 'digest' is set from
    getBookDigest, 'subtitle' is passed through subTitleClean and a missing
    'source' becomes ''. A random userId in 1..50 and status 11 are stored.
    If the insert fails, the failure is logged, the transaction is rolled
    back and, for a u'完结' book, bookType is updated by rawUrl instead.

    :param bookObj: dict with the book columns used in the insert below
    :param allowUpdate: only checked on the failure path, where it returns
                        early; nothing follows it in this function
    :param conn2: optional open connection; used only together with csor2
    :param csor2: optional open cursor belonging to conn2
    :return: None on every path
    '''
    # A connection opened here is closed here; a caller-supplied pair is not.
    ownsConn = not conn2 or not csor2
    if ownsConn:
        conn2, csor2 = getDushuConnCsor()

    try:
        userId = random.randint(1, 50)

        updateTime = int(time.time())

        digest = getBookDigest(bookObj)
        bookObj['digest'] = digest

        # Uniform subtitle clean-up.
        bookObj['subtitle'] = subTitleClean(bookObj['subtitle'])

        if 'source' not in bookObj:
            bookObj['source'] = ''

        try:
            csor2.execute('insert  ' + db_dushu +
              '(categoryCode,typeCode,category,type,userId,title,subtitle,imgUrl,author,updateTime' \
              ",rawUrl,source,digest,status,viewNum, chapterNum, bookType, size) values" \
              "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s)" \
              , (bookObj['categoryCode'],bookObj['typeCode'], bookObj['category'], bookObj['type'], userId,bookObj['title']
                 ,bookObj['subtitle'],bookObj['imgUrl'],bookObj['author'],updateTime, bookObj['rawUrl']
                 ,bookObj['source'],digest, 11,bookObj['viewNum'],bookObj['chapterNum'],bookObj['bookType'],bookObj['size']))
            conn2.commit()
            myLogging.info('succ book, ' +
                           unicode(bookObj['title']).encode('utf-8'))
        except Exception:
            # Roll back on error; the row may already exist.
            myLogging.warning('update rollback; maybe exists, err:  %s',
                              traceback.format_exc())
            if conn2:
                try:
                    conn2.rollback()
                except Exception:
                    myLogging.error('rollback error : ' + bookObj['rawUrl'])

            # A finished book still gets its bookType recorded on the row
            # matched by rawUrl.
            if u'完结' == bookObj['bookType']:
                updateBookTypeByRawUrl(bookObj['bookType'], bookObj['rawUrl'])
            if not allowUpdate:
                return None
    finally:
        if ownsConn:
            csor2.close()
            conn2.close()
Example #11
0
def updateIdsByType(confType, ids):
    '''
    Store ``ids`` on the type-book row whose type equals ``confType``.

    Errors are logged as warnings and not raised.

    :param confType: value matched against the ``type`` column
    :param ids: new value for the ``ids`` column
    :return: None
    '''
    conn, csor = getDushuConnCsor()

    try:
        statement = "".join(
            ["update ", db_typeBook, ' set ids = %s  where type = %s'])
        boundValues = (ids, confType)
        csor.execute(statement, boundValues)
        conn.commit()
    except Exception as err:
        myLogging.warning('update bookType exception: ' + str(err))

    csor.close()
    conn.close()


# if __name__ == '__main__':
#     delBookById(227921)
def deleteChapsLargerThanIdx(bookId, idx):
    '''
    Delete every chapter row of a book whose idx is greater than ``idx``.

    Errors are logged as warnings and not raised.

    :param bookId: book whose chapters are pruned
    :param idx: chapters with an idx strictly above this value are removed
    :return: None
    '''
    conn, csor = getDushuConnCsor()
    try:
        deleteSql = ''.join(
            ['delete from ', db_acticle, " where bookId = %s and idx > %s"])
        boundValues = (bookId, idx)
        csor.execute(deleteSql, boundValues)
        conn.commit()
    except Exception as err:
        myLogging.warning(err)

    csor.close()
    conn.close()
def getBookObjById(dbid):
    '''
    Fetch a book row by its primary-key id.

    :param dbid: value matched against the book table's id column
    :return: the row as a dict, or None when no row matches or the query
             fails (failure is logged)
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = None
    try:
        dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
        dictCsor.execute("select * from " + db_dushu + " where id = %s",
                         (dbid, ))
        conn.commit()
        return dictCsor.fetchone()
    except Exception as e:
        # Fetching after a failed execute would raise; report and return None.
        myLogging.warning('getBookObjById exception: ' + str(e))
        return None
    finally:
        if dictCsor is not None:
            dictCsor.close()
        csor.close()
        conn.close()
def parseBook(allowUpdate, bookObj, zid):
    '''
    Fill the local book fields of a zhuishushenqi book dict, in place.

    Copies/derives zid, imgUrl, category, type, categoryCode, typeCode, size,
    chapterNum, subtitle, viewNum and bookType from the remote keys of
    ``bookObj``.

    :param allowUpdate: not used by this function
    :param bookObj: remote book dict; mutated and returned
    :param zid: remote book id, only used in the skip log message
    :return: the same ``bookObj``, or None when it has fewer than MINCHAPNUM
             chapters (fields assigned before that check stay set)
    '''
    zssqStaticUrl = 'http://statics.zhuishushenqi.com/'

    bookObj['zid'] = bookObj['_id']
    bookObj['imgUrl'] = urlparse.urljoin(zssqStaticUrl, bookObj['cover'])

    # Fall back to the "other" bucket when the remote categories are absent.
    bookObj['category'] = (bookObj['majorCate']
                           if 'majorCate' in bookObj else '其他')
    bookObj['type'] = (bookObj['minorCate']
                       if 'minorCate' in bookObj else '其他')
    bookObj['typeCode'] = 0

    resolvedCodes = getCategoryAndTypeCode(bookObj['category'],
                                           bookObj['type'])
    (bookObj['categoryCode'], bookObj['typeCode'],
     bookObj['category']) = resolvedCodes

    bookObj['size'] = bookObj['wordCount']
    bookObj['chapterNum'] = bookObj['chaptersCount']

    if bookObj['chapterNum'] < MINCHAPNUM:
        myLogging.warning('chapNum too small, skip %s,  return', str(zid))
        return None

    bookObj['subtitle'] = bookObj['longIntro']
    bookObj['viewNum'] = int(bookObj['latelyFollower']) * 9
    bookObj['bookType'] = '连载' if bookObj['isSerial'] else '完结'

    return bookObj
def getCountDuring(timeStart, timeEnd):
    '''
    Count book rows whose updateTime lies strictly between two bounds.

    :param timeStart: exclusive lower bound compared against updateTime
    :param timeEnd: exclusive upper bound compared against updateTime
    :return: the ``count(*)`` value, or None when the query fails (the
             failure is logged)
    '''
    conn, csor = getDushuConnCsor()
    try:
        csor.execute(
            "select count(*) from " + db_dushu +
            " where updateTime > %s and updateTime < %s", (timeStart, timeEnd))
        conn.commit()
        return csor.fetchone()[0]
    except Exception as e:
        # Fetching after a failed execute would raise; report and return None.
        myLogging.warning('getCountDuring exception: ' + str(e))
        return None
    finally:
        csor.close()
        conn.close()
Example #16
0
def getInvestListByNameId(quid, qCname, maxFetchTries=3):
    '''
    Crawl the qichacha "touzi" tab of one company and recurse into each entry.

    Every '.list-group-item' element with an href yields a company uid and
    name; each is stored through insertWithUid, crawled recursively and
    collected into the result. When nothing is found, insertInvestList is
    called with an empty value for ``quid``.

    :param quid: company uid to query; skipped when already in investBloom
    :param qCname: company name (unicode), url-quoted as utf-8
    :param maxFetchTries: how many times the page is requested before giving up
    :return: list of {'uid', 'comName'} dicts; None when the uid was already
             handled or the page could not be fetched
    '''
    cookies = {'PHPSESSID': '5dplss3psrev57ad4jk637jph4'}

    if quid in investBloom:
        myLogging.warning('invest aready done before, uid: %s', quid)
        return None

    url = 'http://www.qichacha.com/company_getinfos?unique=' + quid + '&companyname=' + quote(
        qCname.encode('utf-8')) + '&tab=touzi'

    # Bounded retry: the previous unconditional loop could never exit.
    htmlContent = None
    for _ in range(maxFetchTries):
        htmlContent = getQichachaHtml(url, cookies=cookies)
        if htmlContent:
            break
    if not htmlContent:
        # Do not record "no invest" for a page that was never read.
        myLogging.warning('invest page fetch failed, uid: %s', quid)
        return None

    soup = getSoupByStrEncode(htmlContent)

    resList = []
    # select() returns every match; select_one() returns a single tag whose
    # iteration walks its children instead.
    for uidTag in soup.select('.list-group-item'):
        href = uidTag.get('href')
        if not href:
            continue
        uid = href.replace('firm_', '').replace('.shtml',
                                                '').replace('/', '')
        prv = None
        if '_' in uid:
            # href ids look like "<prefix>_<uid>".
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        comName = uidTag.select_one('.text-lg').get_text()
        comObj = dict()
        comObj['uid'] = uid
        comObj['comName'] = comName

        # NOTE(review): stores the queried quid together with the prefix of
        # the found uid; confirm whether uid was intended here.
        insertWithUid(conn, csor, prv, quid)

        getInvestListByNameId(uid, comName, maxFetchTries)  # recurse

        resList.append(comObj)

    # Persist an empty record when there are no investments.
    if len(resList) < 1:
        insertInvestList(quid, '')

    return resList
def updateBoostWithUpdateTime(dbid):
    '''
    Copy updateTime into typeBoost for the book row with the given id.

    Failures are logged, not raised.

    :param dbid: value matched against the book table's id column
    :return: None (an UPDATE produces no row to fetch)
    '''
    conn, csor = getDushuConnCsor()
    try:
        csor.execute(
            "update " + db_dushu + " set typeBoost = updateTime where id = %s",
            (dbid, ))
        conn.commit()
    except Exception as e:
        myLogging.warning('update typeBoost exception: ' + str(e))
    finally:
        csor.close()
        conn.close()
    return None
Example #18
0
def searchAndCrawlByName(comName, proxy=None):
    '''
    Search qichacha for a company name and store every company uid found.

    Each 'ul.list-group a' link whose href contains 'firm_' is reduced to a
    uid (with an optional "<prefix>_" part split off) and passed to
    insertWithUid on the module-level conn/csor. A failing insert is logged
    and the remaining links are still processed.

    :param comName: company name (unicode); empty/None returns None at once
    :param proxy: not used by this function
    :return: 'ok' after processing the result list; None when the name is
             empty, the page cannot be fetched or it holds no result links
    '''
    if not comName:
        return None
    comName = comName.encode('utf-8')
    # Search for the requested name instead of a fixed debugging keyword.
    baseUrl = 'http://www.qichacha.com/search?key=' + quote(comName)
    htmlContent = getQichachaHtml(baseUrl, noCookie=True)
    if not htmlContent:
        return None
    soup = getSoupByStrEncode(htmlContent)
    uidTags = soup.select('ul.list-group a')
    if not uidTags:
        myLogging.debug(htmlContent)
        return None
    for uidTag in uidTags:
        href = uidTag['href']
        uid = href.replace('firm_', '')
        if uid == href:
            # No 'firm_' marker: this link is not a company page.
            myLogging.warning('not uid, skip %s', href)
            continue

        uid = uid.replace('.shtml', '').replace('/', '')

        prv = None
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]

        try:
            insertWithUid(conn, csor, prv, uid)
        except Exception:
            myLogging.error('insert with uid fail, uid: %s, err: %s', uid,
                            traceback.format_exc())
    return 'ok'
def getExistsCapsRawUrlId(bookId):
    '''
    Return the (id, rawUrl) rows of every stored chapter of a book.

    :param bookId: value matched against the chapter table's bookId column
    :return: the fetched rows, or None when the book has no chapters or the
             query fails (both cases are logged)
    '''
    conn, csor = getDushuConnCsor()
    try:
        csor.execute(
            'select id,rawUrl from cn_dushu_acticle where bookId = %s',
            (bookId, ))
        conn.commit()
        results = csor.fetchall()

        if not results:
            myLogging.warning('no caps,, bookId:' + str(bookId))
            return None
        return results
    except Exception as e:
        myLogging.error(e)
        return None
    finally:
        # Runs on every path, including the early returns above.
        csor.close()
        conn.close()
def handleWebsiteNoise(begin, end):
    '''
    Strip website-address noise from stored chapter content of book 960.

    For each chapter with ``begin < id < end`` the content is lower-cased,
    the two patterns below are removed and the result is written back
    through updateContentById. A failing select is logged and nothing is
    changed.

    :param begin: exclusive lower bound of the chapter id range
    :param end: exclusive upper bound of the chapter id range
    :return: None
    '''
    comNoise = re.compile(u'www.{0,15}com')
    cyrillicNoise = re.compile(u'wwww.{0,15}c.{1,2}м')

    conn2, csor2 = getDushuConnCsor()
    try:
        try:
            csor2.execute(
                'select id,content from cn_dushu_acticle where bookId = 960'
                ' and id > %s and id < %s', (begin, end))
            conn2.commit()
            res = csor2.fetchall()
        except Exception as e:
            # Nothing to clean when the select itself failed.
            myLogging.warning(e)
            return

        for capId, content in res:
            if content is None:
                continue
            content = comNoise.sub("", content.lower())
            content = cyrillicNoise.sub("", content)
            updateContentById(capId, content)
    finally:
        csor2.close()
        conn2.close()
def getChapObjByBookIdChapTitle(bookId, title):
    '''
    Fetch one chapter row by book id and chapter title.

    :param bookId: value matched against the chapter table's bookId column
    :param title: value matched against the chapter table's title column
    :return: the first matching row as a dict, or None when nothing matches
             or the query fails (failure is logged)
    '''
    conn, csor = getDushuConnCsor()
    dictCsor = None
    try:
        dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
        dictCsor.execute(
            'select *  from ' + db_acticle +
            " where bookId = %s and title = %s", (bookId, title))
        conn.commit()
        return dictCsor.fetchone()
    except Exception as e:
        # Fetching after a failed execute would raise; report and return None.
        myLogging.warning(e)
        return None
    finally:
        if dictCsor is not None:
            dictCsor.close()
        csor.close()
        conn.close()
Example #22
0
def updateByBookObj(bookObj):
    '''
    Pull new chapters for one book through the remote check-update API.

    The latest stored chapter (title and idx) is posted to the check-update
    url built from ``bookObj['source']``. Every returned chapter that is not
    already stored is handled through handlChapByBookObjChapObj; when the
    highest handled idx exceeds the stored chapterNum, chapterNum, typeBoost
    and (for a 'serialize' or 'FINISH' status) bookType are updated.

    :param bookObj: stored book row; reads 'id', 'source', 'chapterNum', 'title'
    :return: None
    '''
    latestChapObj = getLatestChapByBookId(bookObj['id'])

    chapName = ''
    chapIdx = 0
    if latestChapObj:
        chapName = latestChapObj['title']
        chapIdx = latestChapObj['idx']

    source = bookObj['source']

    checkUpdateUrl = checkUpdateBaseUrl % source

    payload = {
        'client_chapter_name': chapName.encode('utf-8'),
        'client_bookmark_name': chapName.encode('utf-8'),
        'client_chapter_count': int(chapIdx),
        'client_bookmark_count': int(chapIdx)
    }

    headers = {
        u'User-Agent':
        'Dalvik/2.1.0 (Linux; U; Android 5.1; M3s Build/LMY47I)',
        u'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }

    r = requests.post(checkUpdateUrl, data=payload, headers=headers)

    respJson = json.loads(r.text)
    if not respJson['items'] or respJson['total'] < 1:
        myLogging.info('%s no update, skip', bookObj['id'])
        return

    resIdx = chapIdx
    chapTitles = getChapTitlesByBookId(bookObj['id'])
    chapIdxs = getCapIdxsByBookId(bookObj['id'])
    for chapObj in respJson['items']:
        tempIdx = chapObj['serial_number']
        tempTitle = chapObj['name']
        # Skip only chapters known by position, idx and title alike.
        if tempIdx <= chapIdx and tempIdx in chapIdxs and tempTitle in chapTitles:
            continue
        try:
            rewChapIdx = handlChapByBookObjChapObj(chapObj=chapObj,
                                                   bookObj=bookObj,
                                                   allowUpdate=True)
            resIdx = max(resIdx, rewChapIdx)
        except Exception:
            # One bad chapter must not stop the remaining ones.
            myLogging.error('bookId %s chap idx %s has exception: %s',
                            bookObj['id'], chapObj['serial_number'],
                            traceback.format_exc())

    if resIdx > bookObj['chapterNum']:
        updateOneFieldByOneField('chapterNum', resIdx, 'id', bookObj['id'])

        updateBoostWithUpdateTime(bookObj['id'])

        myLogging.info(str(bookObj['id']) + respJson['book']['name'].encode('utf-8') + ' update ' + str(
            resIdx - bookObj['chapterNum']) \
                       + ' chaps ')

        # Map the remote status to the stored bookType. The FINISH case used
        # to be nested under the 'serialize' check and could never run.
        bookStatus = respJson['book']['status']
        newStatus = None
        if u'serialize' == bookStatus:
            newStatus = u'连载'
        elif u'FINISH' == bookStatus:
            newStatus = u'完结'
        if newStatus is not None:
            updateOneFieldByOneField('bookType', newStatus, 'id',
                                     bookObj['id'])
            myLogging.warning(bookObj['title'].encode('utf-8') +
                              newStatus.encode('utf-8'))
    else:
        myLogging.info(
            str(bookObj['id']) +
            ' has unexcepted, please check. didnot update ')
def crawlCurrentBookObj(mid):
    '''
    Fetch the MianFeiTXT base info of one book and map it to a local book dict.

    The info url is requested with the module-level ``ua`` and requested once
    more when the first answer is empty.

    :param mid: remote book id
    :return: ``(bookObj, count)`` where count is the remote
             latestChapterCount, or ``(None, None)`` when that count is below
             MINCHAPNUM
    '''
    signedQuery = paramMap().mianfeiTXT().mBookId(mid).mianfeiTXTSign().toUrl()
    url = MianFeiTXTBookBaseUrl + '?' + signedQuery

    # One retry when the first response is empty.
    baseInfoContent = getContentWithUA(url, ua) or getContentWithUA(url, ua)
    baseData = json.loads(baseInfoContent)['data']['book']

    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    # The remote count is not accurate; it lags behind real updates.
    count = baseData['latestChapterCount']
    if count < MINCHAPNUM:
        myLogging.warning( 'chapNum too small, skip %s,  return', str(mid))
        return None, None

    serialStatus = baseData['serialStatus']

    bookObj = {
        'subtitle': baseData['summary'],
        'source': str(mid),
        'rawUrl': MianFeiTXTBaseUrl + str(mid),
        'title': title,
        'chapterNum': count,
        'imgUrl': 'http://oss-public.antehao.cn/' + coverUrl,
        'author': author,
        'size': baseData['words'],
        'category': baseData['secondCategory'],
        'type': baseData['thirdCategory'],
        'bookType': serialStatus,
    }

    resolvedCodes = getCategoryAndTypeCode(bookObj['category'],
                                           bookObj['type'])
    (bookObj['categoryCode'], bookObj['typeCode'],
     bookObj['category']) = resolvedCodes

    bookObj['viewNum'] = random.randint(500000, 1000000)

    # Latest chapter index, capped at 200000; kept as a second signal for
    # detecting updates.
    bookObj['latestCapIndex'] = min(baseData['latestChapterId'], 200000)

    return bookObj, count