def fromCategoryId(categoryId):
    # bug fix: the category id was hardcoded to 101 even though the function
    # takes categoryId as a parameter; use the parameter instead
    url = MianFeiTXTChannelBaseUrl + '?' + paramMap().mianfeiTXT().put('secondCategoryId', categoryId)\
        .put('thirdCategoryId', 0).put('filterId', 0).put('sortId', 1).put('pageSize', 2000).put('pageNum', 1)\
        .mianfeiTXTSign().toUrl()
    baseInfoContent = getContentWithUA(url)
    if not baseInfoContent:  # retry once on an empty response
        baseInfoContent = getContentWithUA(url)
    baseObj = json.loads(baseInfoContent)
    for bookObj in baseObj['data']['books']:
        mid = bookObj['id']
        handleByMTID(mid)
def handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj):
    chapContentUrl = chapObj['url']
    chapContent = getContentWithUA(chapContentUrl)
    chapContentObj = json.loads(chapContent)
    if not chapContentObj or not chapContentObj['content'] or len(chapContentObj['content']) < MinChapContentLength:
        myLogging.error('zid %s content too small skip, chapContentUrl %s', bookObj['id'], chapContentUrl)
        return 0
    chapObj.update(chapContentObj)
    chapObj['title'] = chapObj['name']
    chapObj['rawUrl'] = chapContentUrl
    chapObj['idx'] = int(chapObj['serialNumber'])
    del chapObj['serialNumber']
    chapObj['size'] = len(chapObj['content'])
    chapObj['bookId'] = bookObj['id']
    chapObj['source'] = bookObj['source']
    chapObj['bookUUID'] = bookObj['digest']
    digest = getCapDigest(bookObj, chapObj, chapObj['bookChapterId'])
    chapObj['digest'] = digest
    chapObj['content'] = textClean(chapObj['content'])
    capId = insertCapWithCapObj(chapObj, allowUpdate=allowUpdate)
    # aftInsertCap = time.time()
    # insertCap = insertCap + (aftInsertCap - befInsertCap)
    if not capId:
        myLogging.error('no chapId cid %s', chapObj['bookChapterId'])
        return 0
    uploadJson2Bucket(str(chapObj['id']) + '.json', json.dumps(chapObj))
    return chapObj['idx']
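
# Several crawlers in this module fetch a URL and, on an empty response,
# immediately fetch it once more. A minimal sketch of that pattern as a
# shared helper; `getContentWithRetry` is a hypothetical name, not an
# existing function in this codebase.
def getContentWithRetry(url, ua=None, retries=1):
    content = getContentWithUA(url, ua) if ua else getContentWithUA(url)
    while not content and retries > 0:
        content = getContentWithUA(url, ua) if ua else getContentWithUA(url)
        retries -= 1
    return content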
def getBookObjBiZid(zid):
    bookInfoUrl = ZSSQBOOKINFOBASEURL + str(zid)
    bookInfoText = getContentWithUA(bookInfoUrl)
    if not bookInfoText:
        return None
    return json.loads(bookInfoText)
def getSourceId(qid):
    srcUrl = srcListBaseUrl % str(qid)
    srcListContent = getContentWithUA(srcUrl)
    if not srcListContent:
        return
    srcJsonObj = json.loads(srcListContent)
    if not srcJsonObj or not srcJsonObj.has_key('items'):
        myLogging.error('no srcObj items qid %s', qid)
        return
    srcItems = srcJsonObj['items']
    if len(srcItems.keys()) < 1:
        myLogging.error('srcObj items len < 1 qid %s', qid)
        return
    if srcItems.has_key('api.zhuishuwang.com'):
        return srcItems['api.zhuishuwang.com'][0]['book_source_id']
    # updateTIme = 0
    # resId = ''
    # for itmkey in srcItems.keys():
    #     if srcItems[itmkey][0]['update_time'] > updateTIme:
    #         resId = srcItems[itmkey][0]['book_source_id']
    #         updateTIme = srcItems[itmkey][0]['update_time']
    # return resId
    raise InputException('no zhuishuwang source, skip')
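
# The commented-out block above picks the most recently updated source when
# the preferred 'api.zhuishuwang.com' entry is missing. A sketch of that
# fallback as its own function (hypothetical name `getNewestSourceId`),
# assuming each item list carries 'update_time' and 'book_source_id' as the
# commented code does:
def getNewestSourceId(srcItems):
    newestTime = 0
    resId = ''
    for itmKey in srcItems.keys():
        if srcItems[itmKey][0]['update_time'] > newestTime:
            resId = srcItems[itmKey][0]['book_source_id']
            newestTime = srcItems[itmKey][0]['update_time']
    return resId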
def dealById(baseUrl, conn, csor, id):
    # slp = random.randint(1, 100)
    # time.sleep(0.01 * slp)
    url = baseUrl + str(id) + '.json'
    content = getContentWithUA(url, ua)
    if not content or len(content) < 60:
        print id, 'content', content
        return
    jsonObj = json.loads(content)
    data = jsonObj['data'][0]
    if not data or len(str(data)) < 10:
        print id, 'data:', data
        return
    companyType = data['companyType']
    webName = data['webName']
    companyName = data['companyName']
    liscense = data['liscense']  # 'liscense' is spelled this way by the upstream api and db schema
    examineDate = data['examineDate'].strip()
    webSite = ','.join(data['webSite'])
    try:
        csor.execute(
            """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName)
               values (%s,%s,%s,%s,%s,%s,%s,%s);""",
            (str(id), companyName, companyType, examineDate, liscense, "tianyacha", webSite, webName))
        conn.commit()
    except Exception as e:
        # roll back on error
        print e
def getChapsByBocId(bocId):
    chapListUrl = 'http://api.zhuishushenqi.com/btoc/%s?view=chapters' % (bocId)
    chapsText = getContentWithUA(chapListUrl)
    # if not chapsText:
    #     return
    chapListObj = json.loads(chapsText)
    return chapListObj
def getContentByUrl(url):
    capText = getContentWithUA(url, ua)
    if not (capText and len(capText) > 30):
        print 'cap content too short, skip and del book'
        return None
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))
    # ChapterName = ''
    # if len(capRoot.getiterator('ChapterName')) > 0:
    #     ChapterName = capRoot.getiterator('ChapterName')[0].text
    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    # if ('http://' in ChapterContent and len(ChapterContent) < 250):
    #     print 'cap content is url, skip and del book', bookId, ':', ChapterContent
    #     delBookById(bookId)
    #     return None
    # WordsCount = ''
    # if len(capRoot.getiterator('WordsCount')) > 0:
    #     WordsCount = capRoot.getiterator('WordsCount')[0].text
    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for i in range(2, PageCount + 1):
            # bug fix: the original replace() swapped the pageIndex token for
            # itself, so the same page was fetched on every iteration; rewrite
            # page 1's index to page i (callers pass a pageIndex=1 url)
            capApi2 = url.replace('pageIndex=1', 'pageIndex=' + str(i))
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            if ChapterContent == ChapterContent2:
                break
            ChapterContent = ChapterContent + ChapterContent2
    return ChapterContent
def startFromCId():
    baseUrl = 'http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick&pageIndex='
    cc = '&cid='
    page = 1
    shuqCategory = loadShuQSeqC()
    # shuqCategory2 = loadShuQC()
    # totleSize = 220
    for cid in shuqCategory.keys():
        try:
            url = baseUrl + str(page) + cc + str(cid) + capListAPIDeviceInfo
            urlContent = getContentWithUA(url, ua)
            if not (urlContent and len(urlContent) > 30):
                continue
            capRoot = ElementTree.fromstring(urlContent.encode('utf-8'))
            totleSize = int(capRoot.attrib['TotalCount']) / 40 + 1
            try:
                dealBookListPrintBooks(urlContent)
                for page in range(totleSize, 0, -1):
                    url = baseUrl + str(page) + cc + str(cid) + capListAPIDeviceInfo
                    urlContent = getContentWithUA(url, ua)
                    if not (urlContent and len(urlContent) > 30):
                        continue
                    dealBookListPrintBooks(urlContent)
            except Exception as e1:
                print 'deal one page error, cid:', cid, 'page:', page, 'error:', e1
        except Exception as e:
            print 'cid:', cid, 'error:', e
def search(searchInput):
    if isinstance(searchInput, unicode):
        searchInput = searchInput.encode('utf-8')
    url = ZSSQSEARCHBASEURL + quote(searchInput)
    searchResContent = getContentWithUA(url)
    if not searchResContent:
        return None
    searchResObj = json.loads(searchResContent)
    if not searchResObj or not searchResObj.has_key('books'):
        return
    return searchResObj
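
# A minimal usage sketch for search(): look up a title and keep only exact
# title/author matches, mirroring how changeSouceIds() filters its search
# results below. `findZssqBookId` is a hypothetical wrapper name, and the
# '_id'/'title'/'author' field names are assumptions about the zhuishushenqi
# search response, not confirmed by this module.
def findZssqBookId(title, author):
    searchResObj = search(title + author)
    if not searchResObj:
        return None
    for resBook in searchResObj['books']:
        if resBook.get('title') == title and resBook.get('author') == author:
            return resBook.get('_id')
    return None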
def getContentFromXml(bookId, capId, xml):
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='
    # capApi = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=1&bg=0' + capListAPIDeviceInfo
    # capText = getContentWithUA(capApi, ua)
    # if not (capText and len(capText) > 30):
    #     print 'cap content too short, skip and del book'
    #     delBookById(bookId)
    #     return None
    capRoot = ElementTree.fromstring(xml.encode('utf-8'))
    # ChapterName = ''
    # if len(capRoot.getiterator('ChapterName')) > 0:
    #     ChapterName = capRoot.getiterator('ChapterName')[0].text
    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    # if ('http://' in ChapterContent and len(ChapterContent) < 250):
    #     print 'cap content is url, skip and del book', bookId, ':', ChapterContent
    #     delBookById(bookId)
    #     return None
    # WordsCount = ''
    # if len(capRoot.getiterator('WordsCount')) > 0:
    #     WordsCount = capRoot.getiterator('WordsCount')[0].text
    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for pageIndex in range(2, PageCount + 1):
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(
                pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2
    return ChapterContent
def startFromLatestAjax():
    baseUrl = 'http://ajax.shuqiapi.com/?bamp=sqphcm&desc_type=3&page='
    tailUrl = '&tk=NDE3YWM1OWU5Zg%253D%253D'
    for page in range(86, 120):
        url = baseUrl + str(page) + tailUrl
        jsonContent = getContentWithUA(url, ua)
        jsonC = json.loads(jsonContent.encode('utf-8'))
        for book in jsonC['data']['ph']['book_list']:
            bookId = book['id']
            try:
                start(bookId)
            except Exception as e:
                print 'book', bookId, 'error:', e
def initCap():
    sqCat = dict()
    for i in range(0, 800):
        url = 'http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick' \
              '&pageIndex=1&cid=' \
              + str(i) + capListAPIDeviceInfo
        text = getContentWithUA(url, ua)
        if not (text and len(text) > 60):
            continue
        root = ElementTree.fromstring(text.encode('utf-8'))
        # fetch the first Book element via getiterator
        node = root.getiterator("Book")[0]
        parentName = node.attrib['ParentTypeName']
        ParentTypeId = node.attrib['ParentTypeId']
        TypeName = node.attrib['TypeName']
        print TypeName, ParentTypeId, parentName
        tag = dict()
        tag['TypeName'] = TypeName
        # tag['parentName'] = parentName
        # tag['ParentTypeId'] = ParentTypeId
        tag['cid'] = i
        if not sqCat.has_key(parentName):
            children = [tag]
            top = dict()
            top['id'] = ParentTypeId
            top['children'] = children
            sqCat[parentName] = top
        else:
            sqCat[parentName]['children'].append(tag)
        # sqCat[i] = tag
    f = open('shuqCategory.yaml', 'wb')
    yaml.dump(sqCat, f)
    f.close()
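
# Reading the category map back in, presumably what loadShuQSeqC() does
# elsewhere in this module; a sketch under that assumption, reusing the
# module-level yaml import that initCap() already relies on. The function
# name `loadShuqCategoryYaml` is hypothetical.
def loadShuqCategoryYaml(path='shuqCategory.yaml'):
    f = open(path, 'rb')
    try:
        return yaml.load(f)
    finally:
        f.close()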
def changeSouceIds():
    bookObjs = getMianAllBookBaseObjs()
    for bookObj in bookObjs:
        try:
            foundNewId = False
            title = bookObj['title']
            author = bookObj['author']
            source = bookObj['source']
            bookId = bookObj['id']
            searchUrl = MianFeiTXTSearchBaseUrl + '?' + paramMap().mianfeiTXT()\
                .put('keyword', (title + author).encode('utf-8'))\
                .put('pageSize', '10').put('pageNum', '1').put('type', '1')\
                .mianfeiTXTSign()\
                .toUrl()
            r = requests.get(searchUrl)
            searchRes = json.loads(r.text)
            for resBook in searchRes['data']['books']:
                resTitle = resBook['name']
                if resTitle != title:
                    continue
                resAuthor = resBook['author']
                if resAuthor != author:
                    continue
                resId = resBook['id']
                if str(resId) == str(source):
                    myLogging.info('WTF: id no change?, bookId: %s, orgSoueceId: %s, newId: %s', bookId, source, resId)
                latestChapObj = getLatestChapByBookId(bookId)
                if not latestChapObj:
                    myLogging.error('no chaps in db yet, bookId: %s, new mid: %s', bookId, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
                cid = latestChapObj['idx']
                chapTitle = latestChapObj['title']
                capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(resId).mChapId(
                    cid).mianfeiTXTSign().toUrl()
                capContent = getContentWithUA(capContentUrl)
                if not capContent:
                    capContent = getContentWithUA(capContentUrl)
                capListJsonObj = json.loads(capContent, strict=False)
                if not (capListJsonObj['returnCode'] == '0000'):
                    capListJsonObj = json.loads(capContent)
                if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                    myLogging.error('get chap detail fail mid: %s, cid: %s', resId, cid)
                    continue
                chapterName = capListJsonObj['data']['bookChapter']['chapterName']
                if chapterName == chapTitle:
                    myLogging.info('bookId %s change source from %s to %s', bookId, source, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
            if not foundNewId:
                myLogging.error('bookId %s did not find new id !!!, title: %s, author: %s, org source: %s',
                                bookId, title, author, source)
        except Exception as e:
            myLogging.error(traceback.format_exc())
def handlChapsByBookObjZidBocId(bookObj, zid, chapListObj, allowUpdate=False):
    resInx = 0  # tracks the largest chapter index successfully updated
    if not chapListObj:
        myLogging.error('zid %s get chaps list null', zid)
        return resInx
    if not chapListObj.has_key('chapters'):
        myLogging.error('zid %s chaps list no data', zid)
        return resInx
    capIdxs = set()
    capTitles = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in db
        capTitles = getChapTitlesByBookId(bookObj['id'])  # chapter titles already in db
    for idx in range(0, len(chapListObj['chapters'])):
        try:
            chapObj = chapListObj['chapters'][idx]
            if chapObj['title'] in capTitles:
                continue
            if idx in capIdxs:
                continue
            chapObj['cid'] = chapObj['link']
            if chapObj.has_key('id'):
                chapObj['cid'] = chapObj['id']
            chapObj['idx'] = idx
            chapContentUrl = ZSSQCHAPCONTENTBASEURL + quote(chapObj['link'])
            chapContentText = getContentWithUA(chapContentUrl)
            if not chapContentText:
                myLogging.error('zid: %s, dbid: %s, chapId: %s, get chapContent null', zid, bookObj['id'], chapObj['cid'])
                continue
            chapContentObj = json.loads(chapContentText)
            if not chapContentObj or not chapContentObj.has_key('chapter'):
                myLogging.error('zid: %s, dbid: %s, chapId: %s, get no chapter', zid, bookObj['id'], chapObj['cid'])
                continue
            if u'.' == chapContentObj['chapter']['title'] or len(chapContentObj['chapter']['title']) < 2:
                del chapContentObj['chapter']['title']
            chapObj.update(chapContentObj['chapter'])
            chapObj['content'] = chapObj['body']
            if chapObj.has_key('cpContent'):
                chapObj['content'] = chapObj['cpContent']
                del chapObj['cpContent']
            chapObj['content'] = textClean(chapObj['content'])
            if len(chapObj['content']) < MinChapContentLength:
                myLogging.error('zid %s cid %s content too small skip', zid, chapObj['cid'])
                continue
            del chapObj['body']
            del chapObj['link']
            chapObj['rawUrl'] = chapContentUrl
            # chapObj['size'] = int(WordsCount)
            chapObj['size'] = len(chapObj['content'])
            chapObj['bookId'] = bookObj['id']
            chapObj['source'] = bookObj['source']
            chapObj['bookUUID'] = bookObj['digest']
            digest = getCapDigest(bookObj, chapObj, chapObj['cid'])
            chapObj['digest'] = digest
            capId = insertCapWithCapObj(chapObj)
            if not capId:
                continue
            uploadJson2Bucket(str(capId) + '.json', json.dumps(chapObj))
            resInx = max(resInx, idx)
        except Exception as e:
            myLogging.error('zid: %s, dbid: %s, idx: %s, get exception', zid, bookObj['id'], idx)
            myLogging.error(traceback.format_exc())
    return resInx
def getBookObjBiQid(qid, srcId=None, allowUpdate=False):
    if not srcId:
        srcId = getSourceId(qid)
    bookInfoUrl = bookInfoBaseUrl % (qid, srcId)
    bookInfoContent = getContentWithUA(bookInfoUrl)
    bookInfoObj = json.loads(bookInfoContent)
    bookObj = bookInfoObj['items'][0]
    bookObj['title'] = bookObj['name']
    bookObj['subtitle'] = bookObj['desc']
    bookObj['imgUrl'] = checkDefaultImg(bookObj['img_url'])
    if bookObj['status'] == 'SERIALIZE':
        bookObj['bookType'] = u'连载'
    else:
        bookObj['bookType'] = u'完结'
    bookObj['rawUrl'] = bookInfoUrl
    bookObj['category'] = bookObj['labels']
    bookObj['categoryCode'] = getClassifyCodeByName(bookObj['category'])['categoryCode']
    # if categDict.has_key(bookObj['category']):
    #     if categDict[bookObj['category']]['id'] and len(categDict[bookObj['category']]['id']) > 0:
    #         bookObj['categoryCode'] = int(categDict[bookObj['category']]['id'])
    bookObj['type'] = bookObj['category']
    bookObj['typeCode'] = 0
    classObj = getClassifyCodeByName(bookObj['type'])
    if 0 != classObj['typeCode']:
        bookObj['typeCode'] = classObj['typeCode']
        bookObj['categoryCode'] = classObj['categoryCode']
    bookObj['source'] = str(qid) + '/' + str(srcId)
    chapListUrl = chapListBaseUrl % (qid, srcId)
    chapListContent = getContentWithUA(chapListUrl)
    chapListObj = json.loads(chapListContent)
    chapNum = len(chapListObj['items'])
    bookObj['chapterNum'] = chapNum
    if bookObj['chapterNum'] < MINCHAPNUM:
        myLogging.error('chap num too small skip, bookId %s', qid)
        return
    bookObj['size'] = chapNum * random.randint(1000, 3000)
    bookObj['viewNum'] = chapNum * random.randint(20000, 30000)
    bookObj = insertBookWithConn(bookObj, allowUpdate)
    if not bookObj:
        myLogging.error('null bookObj after insert Book to db, bookId %s', qid)
        return
    for chapObj in chapListObj['items']:
        try:
            handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj)
        except Exception as e:
            myLogging.error(traceback.format_exc())
def getShuqiCapList(bookId):
    capList = []
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_chapter.php?pagesize=40&bookId='
    capListAPI = capListAPIBase + str(bookId) + '&pageIndex=' + str(pageIndex) + capListAPIDeviceInfo
    text = getContentWithUA(capListAPI, ua)
    if not (text and len(text) > 160):
        return
    root = ElementTree.fromstring(text.encode('utf-8'))
    gatherId = root.getiterator('BookInfos')[0].attrib['GatherId']
    TotalCount = int(root.getiterator('BookInfos')[0].attrib['TotalCount'])
    topPageCount = TotalCount / 40 + 2  # total number of pages
    for i in range(1, topPageCount):  # without paging, i only ever equals 1
        if i == 1:
            pageRoot = root
        else:
            pageApi = capListAPIBase + str(bookId) + '&pageIndex=' + str(i) + capListAPIDeviceInfo
            pageText = getContentWithUA(pageApi, ua)
            if not (pageText and len(pageText) > 160):
                return
            pageRoot = ElementTree.fromstring(pageText.encode('utf-8'))
        if gatherId and gatherId != '':  # the book has sub-catalogs (volumes)
            for book in pageRoot.getiterator('Book'):
                vId = book.attrib['ChapterId']
                secondApi = capListAPI + '&vid=' + str(vId)  # sub-catalog url
                textSon = getContentWithUA(secondApi, ua)  # sub-catalog content
                xmlSon = ElementTree.fromstring(textSon.encode('utf-8'))  # sub-catalog xml
                sonTotalCount = int(xmlSon.getiterator('BookInfos')[0].attrib['TotalCount'])  # total records in the sub-catalog
                sonPageCount = sonTotalCount / 40 + 2  # total pages in the sub-catalog
                for j in range(1, sonPageCount):  # walk every page of the sub-catalog; without paging only page 1 is visited
                    if j == 1:
                        sonpageRoot = xmlSon  # page 1 needs no extra request
                    else:
                        morePageApi = capListAPIBase + str(bookId) + '&pageIndex=' + str(j) + capListAPIDeviceInfo \
                                      + '&vid=' + str(vId)  # paged sub-catalog url
                        morePageText = getContentWithUA(morePageApi, ua)
                        sonpageRoot = ElementTree.fromstring(morePageText.encode('utf-8'))  # sub-catalog xml
                    for realCap in sonpageRoot.getiterator('Book'):
                        realCapId = realCap.attrib['ChapterId']
                        chapTitle = realCap.attrib['BookChapter']
                        chapObj = {"cid": realCapId, 'title': chapTitle}
                        # dealCap(bookId, realCapId)
                        capList.append(chapObj)
        else:  # no second-level catalog, so no extra api requests and no paging to handle
            for realCap in pageRoot.getiterator('Book'):
                realCapId = realCap.attrib['ChapterId']
                chapTitle = realCap.attrib['BookChapter']
                chapObj = {"cid": realCapId, 'title': chapTitle}
                # dealCap(bookId, realCapId)
                capList.append(chapObj)
    return capList
def getBocObjsByZid(zid):
    getbocBaseUrl = 'http://api.zhuishushenqi.com/atoc?view=summary&book='
    botText = getContentWithUA(getbocBaseUrl + zid)
    bocObjs = json.loads(botText)
    return bocObjs
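
# A short usage sketch tying the two zhuishushenqi table-of-contents helpers
# together: list the toc summaries for a book, then fetch the chapter list of
# the first one via getChapsByBocId(). The '_id' field on the summary objects
# is an assumption about the api response, not confirmed by this module, and
# `getFirstChapList` is a hypothetical wrapper name.
def getFirstChapList(zid):
    bocObjs = getBocObjsByZid(zid)
    if not bocObjs:
        return None
    bocId = bocObjs[0].get('_id')
    if not bocId:
        return None
    return getChapsByBocId(bocId)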
def getCapContentObj(bookId, capId, mysqlBKid):
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='
    capApi = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(pageIndex) \
             + '&bg=0' + capListAPIDeviceInfo
    capText = getContentWithUA(capApi, ua)
    capObj = dict()
    capObj['bookFail'] = False  # flags whether the whole book is un-crawlable; if so, later chapters are not worth fetching
    if not capText:
        print 'cap content none'
        return None
    if not len(capText) > 30:
        print 'cap content too short, skip and del book'
        delBookById(mysqlBKid)
        capObj['bookFail'] = True
        return capObj
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))
    ChapterName = ''
    if len(capRoot.getiterator('ChapterName')) > 0:
        ChapterName = capRoot.getiterator('ChapterName')[0].text
    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    if not ChapterContent:
        # retry once: re-fetch and re-parse the same chapter page
        capText = getContentWithUA(capApi, ua)
        if not capText:
            print 'cap content none'
            return None
        if not len(capText) > 30:
            print 'cap content too short, skip and del book'
            delBookById(mysqlBKid)
            capObj['bookFail'] = True
            return capObj
        capRoot = ElementTree.fromstring(capText.encode('utf-8'))
        ChapterName = ''
        if len(capRoot.getiterator('ChapterName')) > 0:
            ChapterName = capRoot.getiterator('ChapterName')[0].text
        ChapterContent = ''
        if len(capRoot.getiterator('ChapterContent')) > 0:
            ChapterContent = capRoot.getiterator('ChapterContent')[0].text
        if not ChapterContent:
            return None
    ChapterContent = ChapterContent.strip()
    if ChapterContent.startswith('http') and len(ChapterContent) < 250:
        print 'cap content is url, skip and del book', bookId, ':', ChapterContent
        delBookById(mysqlBKid)
        capObj['bookFail'] = True
        return capObj
    WordsCount = ''
    if len(capRoot.getiterator('WordsCount')) > 0:
        WordsCount = capRoot.getiterator('WordsCount')[0].text
    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for i in range(2, PageCount + 1):
            pageIndex = i
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(
                pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)
            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator('ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2
    capObj['content'] = ChapterContent.replace(u'***求收藏***', '').replace(u'***(求收藏)***', '').replace(u'求收藏', '')
    capObj['title'] = ChapterName
    capObj['rawUrl'] = capApi[0:200]
    # capObj['size'] = int(WordsCount)
    capObj['size'] = len(capObj['content'])
    return capObj
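
# Usage sketch: walk the shuqi chapter list from getShuqiCapList() and fetch
# each chapter body with getCapContentObj(), stopping early when the whole
# book is flagged as un-crawlable. `crawlShuqiBook` is a hypothetical wrapper
# name; mysqlBKid is the book's db row id, as in the function above.
def crawlShuqiBook(bookId, mysqlBKid):
    chapObjs = getShuqiCapList(bookId)
    if not chapObjs:
        return
    for chapObj in chapObjs:
        capObj = getCapContentObj(bookId, chapObj['cid'], mysqlBKid)
        if not capObj:
            continue
        if capObj.get('bookFail'):
            break  # the whole book is marked un-crawlable; stop here
        print capObj['title'], capObj['size']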
def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx=1):
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in db
    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:
            if allowUpdate:
                if cid in capIdxs:
                    continue  # this chapter is already in db, skip
            befCrawl = time.time()
            succCapTimes = succCapTimes + 1
            # capContentUrl = MianFeiContentBaseUrl + str(cid) + '&contentid=' + str(mid)
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(bookObj['source']).mChapId(
                cid).mianfeiTXTSign().toUrl()
            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                capContent = getContentWithUA(capContentUrl, ua)
            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                resIdx = min(cid, resIdx)
                myLogging.info('chap content null, RETURN, capId: ' + str(cid) + ' mid: ' + str(mid))
                return resIdx  # the source api lags on updates; bail out here so the retry-forward logic added later can take over
            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null, RETURN, capId: ' + str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                return resIdx  # the source api lags on updates; bail out here so the retry-forward logic added later can take over
            if contentSoup.body['style']:
                del contentSoup.body['style']
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '').replace(u'\n\n', u'\n').replace(
                u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # capObj['size'] = int(WordsCount)
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']
            digest = getCapDigest(bookObj, capObj, cid)
            capObj['digest'] = digest
            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)
            capId = insertCapWithCapObj(capObj)
            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)
            if not capId:
                continue
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl ' + str(mid) + ' cap ' + str(cid) + ' exception: ' + str(e))
            resIdx = min(cid, resIdx)
    if succCapTimes > 1:
        succCapTimes = succCapTimes - 1
    myLogging.info(
        'crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) +
        ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) +
        ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx
def crawlCurrentBookObj(mid):
    # url = MianFeiTXTBaseUrl + str(mid)
    url = MianFeiTXTBookBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(mid).mianfeiTXTSign().toUrl()
    baseInfoContent = getContentWithUA(url, ua)
    if not baseInfoContent:
        baseInfoContent = getContentWithUA(url, ua)
    baseObj = json.loads(baseInfoContent)
    baseData = baseObj['data']['book']
    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    # contentUrl = baseData['contentUrl']
    count = baseData['latestChapterCount']  # inaccurate: the source updates late
    if count < MINCHAPNUM:
        myLogging.warning('chapNum too small, skip %s, return', str(mid))
        return None, None
    # isOver = baseData['isOver']
    BookType = baseData['serialStatus']
    # if isOver == 1:
    #     BookType = '完结'
    # bookDetailHtml = getContentWithUA(MianFeiTXTBookDetailUrl + str(mid), ua)
    # bookDetailSoup = getSoupByStr(bookDetailHtml)
    # bookDesc = bookDetailSoup.select_one('#J-desc').get_text().replace('\n', '').replace('\t\t', '\t')
    # bookLabels = []
    # for span in bookDetailSoup.select('#J-lables-items span'):
    #     bookLabels.append(span.get_text())
    bookObj = dict()
    bookObj['subtitle'] = baseData['summary']
    bookObj['source'] = "" + str(mid)
    bookObj['rawUrl'] = MianFeiTXTBaseUrl + str(mid)
    bookObj['title'] = title
    bookObj['chapterNum'] = count  # updates late
    bookObj['imgUrl'] = 'http://oss-public.antehao.cn/' + coverUrl
    bookObj['author'] = author
    bookObj['size'] = baseData['words']
    bookObj['category'] = baseData['secondCategory']
    bookObj['type'] = baseData['thirdCategory']
    bookObj['bookType'] = BookType
    bookObj['categoryCode'], bookObj['typeCode'], bookObj['category'] = getCategoryAndTypeCode(bookObj['category'],
                                                                                               bookObj['type'])
    bookObj['viewNum'] = random.randint(500000, 1000000)
    # latest chapter index, used as a second signal when checking for updates
    bookObj['latestCapIndex'] = min(baseData['latestChapterId'], 200000)
    # try:
    #     capExamples = bookDetailSoup.select('.J-category-li')
    #     if capExamples and len(capExamples) > 2:
    #         bookObj['latestCapIndex'] = int(capExamples[2]['id'])  # take the third one; sometimes there are 3, sometimes 6
    # except Exception:
    #     myLogging.warning(traceback.format_exc())
    return bookObj, count
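
# End-to-end usage sketch for the MianFeiTXT path: fetch the book's base
# object, persist it with insertBookWithConn() as getBookObjBiQid() does,
# then crawl its chapters with handleCapsByBookObj(). The wrapper name
# `crawlOneMid` is hypothetical.
def crawlOneMid(mid, allowUpdate=False):
    bookObj, count = crawlCurrentBookObj(mid)
    if not bookObj:
        return 0
    bookObj = insertBookWithConn(bookObj, allowUpdate)
    if not bookObj:
        myLogging.error('null bookObj after insert Book to db, mid %s', mid)
        return 0
    return handleCapsByBookObj(allowUpdate, bookObj, count, mid)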