Example #1
def getContentByUrl(url):
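    """Fetch the chapter XML at `url`, extract ChapterContent, and append the
    remaining pages when PageCount says the chapter is paginated."""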
    capText = getContentWithUA(url, ua)

    if not (capText and len(capText) > 30):
        print 'cap content too short ,skip and del book'
        return None
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))

    # ChapterName = ''
    # if len(capRoot.getiterator('ChapterName')) > 0:
    #     ChapterName = capRoot.getiterator('ChapterName')[0].text

    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text

    # if ('http://' in ChapterContent and len(ChapterContent) < 250):
    #     print 'cap content is url ,skip and del book', bookId, ' : ', ChapterContent
    #     delBookById(bookId)
    #     return None
    WordsCount = ''
    # if len(capRoot.getiterator('WordsCount')) > 0:
    #     WordsCount = capRoot.getiterator('WordsCount')[0].text

    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for i in range(2, PageCount + 1):
            pageIndex = i
            # assumes the incoming url requests pageIndex=1; swap in the current page index
            capApi2 = url.replace('pageIndex=1', 'pageIndex=' + str(pageIndex))
            # capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(
            # pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)

            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator(
                    'ChapterContent')[0].text
            if ChapterContent == ChapterContent2:
                break
            ChapterContent = ChapterContent + ChapterContent2
    return ChapterContent
Example #2
def dealById(baseUrl, conn, csor, id):
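    """Fetch the company JSON record for `id` and insert its fields into the com_base table."""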
    # slp = random.randint(1, 100)
    # time.sleep(0.01 * slp)
    url = baseUrl + str(id) + '.json'
    content = getContentWithUA(url, ua)
    if not content or len(content) < 60:
        print id, 'content', content
        # continue
        return
    jsonObj = json.loads(content)
    data = jsonObj['data'][0]
    if not data or len(str(data)) < 10:
        print id, 'data:', data
        return
        # continue
    companyType = data['companyType']
    webName = data['webName']
    companyName = data['companyName']
    liscense = data['liscense']
    examineDate = data['examineDate'].strip()
    webSite = ','.join(data['webSite'])
    # sql = """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""" % (str(id), companyName, companyType,examineDate, liscense, "tianyacha",webSite,webName)
    try:
        csor.execute(
            """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""",
            (str(id), companyName, companyType, examineDate, liscense,
             "tianyacha", webSite, webName))
        conn.commit()
    except Exception as e:
        # roll back on error
        print e
Example #3
def koolearn(muluUrl, stage):
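    """Crawl one koolearn listing page: parse each article entry, hand it to
    kooleanStartByContentUrl, then recurse into the next listing page."""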

    global conn, csor
    if not conn or (not csor):
        conn, csor = getTmathConnCsor()

    muluHtmlContent = getContentWithUA(muluUrl, defaultPCUa)
    muluSoup = getSoupByStr(muluHtmlContent)
    for pageLi in muluSoup.select('.list01 ul li'):
        try:
            title = pageLi.select_one('h3').get_text()
            if u'下载' in title:
                continue
            descTag = pageLi.select_one('.js2 p')
            if not descTag:
                descTag = pageLi.select_one('.js p')
            desc = descTag.get_text()

            tags = pageLi.select_one('.c_lv')['title']
            ntype = tags  # pick one representative tag as the type, usually the second one
            if len(tags) > 3:
                ts = tags.split(' ')
                if len(ts) > 2:
                    ntype = ts[1]
            contentUrl = pageLi.select_one('h3 a')['href']

            kooleanStartByContentUrl(conn, contentUrl, csor, desc, ntype,
                                     stage, tags, title)
        except Exception as ee:
            print traceback.format_exc()

    # fetch the next page
    footLinks = muluSoup.select('#page a')
    nextUrl = footLinks[len(footLinks) - 1]['href']
    koolearn(urlparse.urljoin(muluUrl, nextUrl), stage)
Example #4
def getBookCommentList(sqbookId):
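    """Fetch up to three pages of shuqi book comments and return their attributes as a list."""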
    baseUrl = 'http://api1.shuqireader.com/reader/bc_bbs_interface.php?bid='
    midUrl = '&bbs=see&bbs_num=20&bbs_rand_num='
    commentList = []
    for i in range(1, 4):
        url = baseUrl + str(sqbookId) + midUrl + str(i) + capListAPIDeviceInfo
        commentText = getContentWithUA(url, ua)
        if not (commentText and len(commentText) > 30):
            print 'cap content too short '
            break
        capRoot = ElementTree.fromstring(commentText.encode('utf-8'))
        for comment in capRoot.getiterator('Bbs'):
            commentList.append(comment.attrib)
            # BbsId = comment.attrib['BbsId']
            # BbsIdUserName = comment.attrib['BbsIdUserName']
            # BbsIdUserId = comment.attrib['BbsIdUserId']
            # BbsContent = comment.attrib['BbsContent']
            # BbsTime = comment.attrib['BbsTime']
            #
            # commentObj = json.dumps(comment.attrib)
    return commentList
Example #5
def fromPapper():
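    """Walk the jyeoo paper-list API grade by grade and page by page, loading each paper's detail JSON."""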

    papperListUrl = 'http://api.jyeoo.com/v1/math2/report?b=0&s=0&g=%s&t=0&e=0&r=&y=0&x=&pi=%s&ps=20&po=2'
    papperDetailBaseUrl = 'http://api.jyeoo.com/v1/math2/report/%s?ia=false'
    questDetailUrl = 'http://api.jyeoo.com/math2/AppTag/GetQues/%s'

    AuthCode = 'Token 9F5BBF8F752F060B00D38F7C81686852695A463CD5661FE0848CBEADB3ACFD5EE96B0D3FB81C8FEB1' \
               'EEC4F7CDA82D9DEE1603C3FED9A10DCD04FF9A5D4A677589F8891C0CA24ECB55A50EA11FFE8AA1B6F389D23A42B' \
               '46E9529444F65FC72870C19AA1299F39C3809B3FB1C8D12B4C5E179FF3DA7ADB9AF5F8D40C95FC5418FE30CE3D884A' \
               '52DA1CCC9AAB43AC1DCC501FBE1936820E5D73'

    Cookie = 'jyean=5x1Tibi9YuqrgS_gvFCeIr04zbNLyVJh9OK3GtKXzwblw1fjXNmqyzh5Facz4VuQKP20e1BJjK' \
             'PgCqVHWiQ7nlBmyQoYE3JEwEFmlp60djjncxNj2m4iSwj8YnOHXa0p0;jy=6B882BD2C4626BDCBD' \
             '1A156DAEF3B6A149459A79829709858932A0831D669F2E504EB7C714F40360013A05356D0F284759128FC' \
             '09556AA4C66DB25AF5F6C5E8CA16BB1D5261C4B4C74C002D90BE6C0103D6B80DC270249B19D933EFED5E85651E2817' \
             '5AD8FDE7C21D6373B64C8276A4E25D88987C37AC54A91A8A44888540B25163F05F330F8E7A88991394DAFE124159DE8407' \
             'C8256AE9AE7CE4C8937AF95418BA780A2AEB99EF452B60E765A607BDCF94CF605D5D3BD058E9BE846875E6C2E2A587BE80' \
             '55436E5FD290661F6F3FEB41EF00CA118E16E13B42F509ED690F7038DA498DA9EA0A39E5F6A377E409A5230CA67C9B7C' \
             'A9A00B3356D77346878D2B78188D1F3D17F48619D51D6C9158C6491C96423357206B7BDF1FFD7A2C4A34C334F8EE97ED' \
             '32FE7E075315375AAACDEC9B8AA17AF3F367827930B803BD060A685F8693E318F7782663D9C18F84753229011B6D' \
             '356BD26835F31CAD0F65B1DE78D915FE08D2FBEA480574BF9431C2DF9AD;'
    header = dict()
    header['Authorization'] = AuthCode
    header['Cookie'] = Cookie

    for g in range(1,13):
        for pageSize in range(0, 150):
            pListContent = getContentWithUA(papperListUrl % (g,pageSize), headers=header)
            if pListContent:
                pListJson = json.loads(pListContent)
                for papper in pListJson['Data']:
                    ppId = papper['ID']

                    papperDetailContent = getContentWithUA(papperDetailBaseUrl % ppId, headers=header)
                    if not papperDetailContent:
                        print 'papper detail failed continue'
                        continue
                    papperDetailJson = json.loads(papperDetailContent)

                    ppTitle = papperDetailJson['Title']
                    Score = papperDetailJson['Score']
                    SchoolName = papperDetailJson['SchoolName']
                    Degree = papperDetailJson['Degree']

                    for partJson in papperDetailJson['Groups']:
                        partName = partJson['Key']
Example #6
def startFromCId(p, queue):
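    """Iterate over every shuqi category id, page through its book list, and hand each page to dealBookListUrlContentMT."""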
    baseUrl = 'http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick&pageIndex='
    cc = '&cid='
    page = 1
    shuqCategory = loadShuQSeqC()
    shuqCategory2 = loadShuQC()
    totleSize = 220

    for cid in shuqCategory.keys():

        try:

            url = baseUrl + str(page) + cc + str(cid) + capListAPIDeviceInfo

            urlContent = getContentWithUA(url, ua)

            if not (urlContent and len(urlContent) > 30):
                continue

            capRoot = ElementTree.fromstring(urlContent.encode('utf-8'))

            totleSize = int(capRoot.attrib['TotalCount']) / 40 + 1

            try:
                dealBookListUrlContentMT(p, queue, shuqCategory2, urlContent)

                for page in range(totleSize, 0, -1):
                    url = baseUrl + str(page) + cc + str(
                        cid) + capListAPIDeviceInfo

                    urlContent = getContentWithUA(url, ua)

                    if not (urlContent and len(urlContent) > 30):
                        continue

                    dealBookListUrlContentMT(p, queue, shuqCategory2,
                                             urlContent)
            except Exception as e1:
                print 'deal one page error, cid: ', cid, ' page: ', page
        except Exception as e:
            print "cid : ", cid, 'error: ', e
Example #7
def getContentFromXml(bookId, capId, xml):
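    """Parse a chapter XML string, extract ChapterContent, and fetch any extra pages reported by PageCount."""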
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='

    # capApi = capListAPIBase + str(bookId) + '&chapterid=' + str(capId) + '&pageIndex=' + str(
    #     pageIndex) + '&bg=0' + capListAPIDeviceInfo
    # capText = getContentWithUA(capApi, ua)

    # if not (capText and len(capText) > 30):
    #     print 'cap content too short ,skip and del book'
    #     delBookById(bookId)
    #     return None
    capRoot = ElementTree.fromstring(xml.encode('utf-8'))

    # ChapterName = ''
    # if len(capRoot.getiterator('ChapterName')) > 0:
    #     ChapterName = capRoot.getiterator('ChapterName')[0].text

    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text

    # if ('http://' in ChapterContent and len(ChapterContent) < 250):
    #     print 'cap content is url ,skip and del book', bookId, ' : ', ChapterContent
    #     delBookById(bookId)
    #     return None
    WordsCount = ''
    # if len(capRoot.getiterator('WordsCount')) > 0:
    #     WordsCount = capRoot.getiterator('WordsCount')[0].text

    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for i in range(2, PageCount + 1):
            pageIndex = i
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(
                capId) + '&pageIndex=' + str(
                    pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)

            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator(
                    'ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2

    return ChapterContent
Example #8
def updateByCategoryIdZongheng(catId):
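    """Re-crawl every zongheng book in the given category: delete its stored chapters, then fetch and re-insert each chapter."""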
    sql = 'SELECT id,rawUrl,digest from cn_dushu_book  where categoryCode = ' + str(
        catId) + ' and rawUrl like "%zongheng%" ORDER BY id desc;'
    try:
        csor.execute(sql)
        conn.commit()
    except Exception as e:
        # roll back on error
        print 'mysql ex: ', e
    results = csor.fetchall()
    for book in results:
        bid = book[0]
        url = book[1]
        bookDigest = book[2]
        deleteCapsByBookId(bid)

        url = url.replace('com/book', 'com/showchapter')

        content = getContentWithUA(url, ua)

        soup = getSoupByStr(content)

        caps = soup.select('.chapterBean')
        if not caps:
            continue
        for i in range(0, len(caps)):
            cap = caps[i]
            capUrl = cap.select('a')[0]['href']
            capName = cap.select('a')[0].get_text()
            content, host = getAndParse(capUrl)
            if not content:
                continue
            capObj = dict()
            capObj['title'] = capName
            capObj['rawUrl'] = capUrl
            capObj['source'] = '纵横'
            capObj['content'] = content
            capObj['bookId'] = bid
            capObj['idx'] = i

            m2 = hashlib.md5()
            forDigest = capName + u'#' + str(i)
            # forDigest = u'总裁我很忙#jxj季'
            m2.update(forDigest.encode('utf-8'))
            digest = m2.hexdigest()

            capObj['digest'] = digest
            capObj['size'] = len(content)
            capObj['bookUUID'] = bookDigest

            insertCapWithCapObj(capObj, conn, csor)
Example #9
def startFromLatestAjax():
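    """Page through a shuqi AJAX book list and start a crawl for every book id found."""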
    baseUrl = 'http://ajax.shuqiapi.com/?bamp=sqphcm&desc_type=3&page='
    tailUrl = '&tk=NDE3YWM1OWU5Zg%253D%253D'
    page = 1
    import json
    for page in range(86, 120):
        url = baseUrl + str(page) + tailUrl

        jsonContent = getContentWithUA(url, ua)
        jsonC = json.loads(jsonContent.encode('utf-8'))

        for book in jsonC['data']['ph']['book_list']:
            bookId = book['id']
            try:
                start(bookId)
            except Exception as e:
                print 'book ', bookId, ' error: ', e
Example #10
def initCap():
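    """Probe shuqi category ids 0-799, build a parent/child category tree, and dump it to shuqCategory.yaml."""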
    sqCat = dict()
    for i in range(0, 800):
        url = 'http://api.shuqireader.com/reader/bc_storylist.php?pagesize=40&PageType=category&item=allclick&pageIndex=1&cid=' \
              + str(i) + capListAPIDeviceInfo
        text = getContentWithUA(url, ua)

        if not (text and len(text) > 60):
            continue

        root = ElementTree.fromstring(text.encode('utf-8'))

        # ways to fetch an element
        # 1. via getiterator
        node = root.getiterator("Book")[0]

        parentName = node.attrib['ParentTypeName']
        ParentTypeId = node.attrib['ParentTypeId']
        TypeName = node.attrib['TypeName']

        print TypeName, ParentTypeId, parentName

        tag = dict()
        tag['TypeName'] = TypeName
        # tag['parentName'] = parentName
        # tag['ParentTypeId'] = ParentTypeId
        tag['cid'] = i

        if parentName not in sqCat:
            children = []
            children.append(tag)
            top = dict()
            top['id'] = ParentTypeId
            top['children'] = children
            sqCat[parentName] = top
        else:
            sqCat[parentName]['children'].append(tag)
        # sqCat[i] = tag

    f = open('shuqCategory.yaml', 'wb')
    yaml.dump(sqCat, f)
    f.close()
Example #11
def searchAndCrawlByName(comName, proxy=None):
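    """Fetch a qichacha page, extract company uids from the result list, and insert each into the database."""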
    if not comName:
        return None
    comName = comName.encode('utf-8')
    # baseUrl = 'http://www.qichacha.com/search?key=' + quote(comName)
    # baseUrl = 'http://www.qichacha.com/firm_CN_ea3a783f0c010fc31a2d75c2c9aa9b75'
    baseUrl = 'http://www.qichacha.com/firm_c3ece65bad28c17cc7f67168448e50e1.shtml'
    ua = random.choice(USER_AGENTS)
    htmlContent = getContentWithUA(baseUrl, ua, proxy=proxy)
    if not htmlContent:
        return None
    soup = getSoupByStrEncode(htmlContent)
    if not soup.select('ul.list-group a') or len(
            soup.select('ul.list-group a')) < 1:
        print htmlContent
        return None
    for uidTag in soup.select('ul.list-group a'):
        uid = uidTag['href'].replace('firm_', '')
        if uid == uidTag['href']:
            print 'not uid, skip', uidTag['href']
            continue

        uid = uid.replace('.shtml', '').replace('/', '')

        prv = None
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        # comName = uidTag.select_one('.text-lg').get_text()
        # comObj = dict()
        # comObj['uid'] = uid
        # comObj['comName'] = comName

        try:
            insertWithUid(conn, csor, prv, uid)
        except Exception as e:
            print 'insert with uid fail, uid:', uid
        # print comLink
    return 'ok'
Example #12
def getInvestListByNameId(quid, qCname):
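    """Recursively crawl a company's investment list on qichacha and return the invested companies found."""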

    if quid in investBloom:
        print 'invest already done before, uid:', quid
        return None

    url = 'http://www.qichacha.com/company_getinfos?unique=' + quid + '&companyname=' + quote(
        qCname.encode('utf-8')) + '&tab=touzi'
    resList = []
    htmlContent = getContentWithUA(url, ua)
    soup = getSoupByStrEncode(htmlContent)

    for uidTag in soup.select('.list-group-item'):
        uid = uidTag['href'].replace('firm_', '').replace('.shtml',
                                                          '').replace('/', '')
        prv = None
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        comName = uidTag.select_one('.text-lg').get_text()
        comObj = dict()
        comObj['uid'] = uid
        comObj['comName'] = comName

        insertWithUid(conn, csor, prv, quid)

        getInvestListByNameId(uid, comName)  # recurse into the invested company

        resList.append(comObj)

    # insertWithUid(conn,csor,None,quid)

    # save to DB
    if len(resList) < 1:
        # no investment records
        insertInvestList(quid, '')

    return resList
Example #13
def getCapContentObj(bookId, capId, mysqlBKid):
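    """Fetch one shuqi chapter (retrying once), follow its pagination, and return a dict with title, content and size."""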
    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_showchapter.php?bookId='

    capApi = capListAPIBase + str(bookId) + '&chapterid=' + str(
        capId) + '&pageIndex=' + str(
            pageIndex) + '&bg=0' + capListAPIDeviceInfo
    capText = getContentWithUA(capApi, ua)

    if not (capText and len(capText) > 30):
        print 'cap content too short ,skip and del book'
        # delBookById(mysqlBKid)
        return None
    capRoot = ElementTree.fromstring(capText.encode('utf-8'))

    ChapterName = ''
    if len(capRoot.getiterator('ChapterName')) > 0:
        ChapterName = capRoot.getiterator('ChapterName')[0].text

    ChapterContent = ''
    if len(capRoot.getiterator('ChapterContent')) > 0:
        ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    if not ChapterContent:
        capText = getContentWithUA(capApi, ua)

        if not (capText and len(capText) > 30):
            print 'cap content too short ,skip and del book'
            # delBookById(mysqlBKid)
            return None
        capRoot = ElementTree.fromstring(capText.encode('utf-8'))

        ChapterName = ''
        if len(capRoot.getiterator('ChapterName')) > 0:
            ChapterName = capRoot.getiterator('ChapterName')[0].text

        ChapterContent = ''
        if len(capRoot.getiterator('ChapterContent')) > 0:
            ChapterContent = capRoot.getiterator('ChapterContent')[0].text
    if not ChapterContent:
        return None
    ChapterContent = ChapterContent.strip()

    if (ChapterContent.startswith('http') and len(ChapterContent) < 250):
        print 'cap content is url ,skip and del book', bookId, ' : ', ChapterContent
        delBookById(mysqlBKid)
        return None
    WordsCount = ''
    if len(capRoot.getiterator('WordsCount')) > 0:
        WordsCount = capRoot.getiterator('WordsCount')[0].text

    PageCount = 1
    if len(capRoot.getiterator('PageCount')) > 0:
        PageCount = int(capRoot.getiterator('PageCount')[0].text)
    if PageCount > 1:
        for i in range(2, PageCount + 1):
            pageIndex = i
            capApi2 = capListAPIBase + str(bookId) + '&chapterid=' + str(
                capId) + '&pageIndex=' + str(
                    pageIndex) + '&bg=0' + capListAPIDeviceInfo
            capText2 = getContentWithUA(capApi2, ua)

            if not (capText2 and len(capText2) > 160):
                return
            capRoot2 = ElementTree.fromstring(capText2.encode('utf-8'))
            ChapterContent2 = ''
            if len(capRoot2.getiterator('ChapterContent')) > 0:
                ChapterContent2 = capRoot2.getiterator(
                    'ChapterContent')[0].text
            ChapterContent = ChapterContent + ChapterContent2

    capObj = dict()
    capObj['content'] = ChapterContent.replace(u'***求收藏***', '').replace(
        u'***(求收藏)***', '').replace(u'求收藏', '')
    capObj['title'] = ChapterName
    capObj['rawUrl'] = capApi[0:200]
    # capObj['size'] = int(WordsCount)
    capObj['size'] = len(capObj['content'])
    return capObj
Example #14
def handleByMTID(mid):
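    """Fetch a book's base info and detail page, insert the book record, then download, clean, insert and upload its chapters."""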
    baseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getbaseinfo.ajax?contentid='
    capListBaseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getcatalog.ajax?contentid=' + str(mid) \
                     +'&pageindex=1&pagesize=100000000'
    capContentBaseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getcharpter.ajax?chapterindex='  #2&contentid=171117'
    bookDetailUrl = 'http://m.yingyangcan.com.cn/interface/template/content/book_detail.vhtml?id='
    url = baseUrl + str(mid)
    baseInfoContent = getContentWithUA(url, ua)
    if not baseInfoContent:
        baseInfoContent = getContentWithUA(url, ua)
    baseObj = json.loads(baseInfoContent)

    baseData = baseObj['data']
    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    contentUrl = baseData['contentUrl']
    count = baseData['count']
    isOver = baseData['isOver']
    BookType = '连载'
    if isOver == 1:
        BookType = '完结'

    bookDetailHtml = getContentWithUA(bookDetailUrl + str(mid), ua)
    bookDetailSoup = getSoupByStr(bookDetailHtml)
    bookDesc = bookDetailSoup.select_one('#J-desc').get_text().replace(
        '\n', '').replace('\t\t', '\t')

    bookObj = dict()
    bookObj['subtitle'] = bookDesc
    bookObj['source'] = "" + str(mid)
    bookObj['rawUrl'] = url
    bookObj['title'] = title
    bookObj['chapterNum'] = count
    bookObj['imgUrl'] = coverUrl
    bookObj['author'] = author
    bookObj['size'] = count * 1000
    bookObj['category'] = '仙侠'
    bookObj['type'] = '重生'

    bookObj['bookType'] = BookType

    bookObj['typeCode'] = 4
    bookObj['categoryCode'] = 1

    bookObj['viewNum'] = random.randint(500000, 1000000)

    m2 = hashlib.md5()
    forDigest = title + u'#' + author
    m2.update(forDigest.encode('utf-8'))
    digest = m2.hexdigest()

    bookObj['digest'] = digest

    bookObj = insertBookWithConn(bookObj, conn2, csor2)

    # myBookId = bookObj['id']
    #
    for cid in range(1047, count + 1):

        capContentUrl = capContentBaseUrl + str(cid) + '&contentid=' + str(mid)
        capContent = getContentWithUA(capContentUrl, ua)
        if not capContent:
            capContent = getContentWithUA(capContentUrl, ua)
        capListJsonObj = json.loads(capContent)
        if not (capListJsonObj['status'] == 1000
                and capListJsonObj['message'] == u'成功'):
            capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['status'] == 1000
                    and capListJsonObj['message'] == u'成功'):
                continue
        capObj = dict()
        orgContent = capListJsonObj['data']['chapter']
        contentSoup = getSoupByStr(orgContent)
        del contentSoup.body['style']
        content = unicode(contentSoup.body).replace(u'<body>', '').replace(
            u'</body>', '').replace(u'\n\n', u'\n').replace(
                u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
        capObj['content'] = content
        capObj['title'] = unicode(contentSoup.title.get_text())
        capObj['rawUrl'] = capContentUrl
        # capObj['size'] = int(WordsCount)
        capObj['size'] = len(content)
        capObj['bookId'] = bookObj['id']
        capObj['source'] = bookObj['source']
        capObj['idx'] = cid
        capObj['bookUUID'] = bookObj['digest']

        m2 = hashlib.md5()
        forDigest = bookObj['digest'] + capObj['title'] + u'#' + str(cid)
        m2.update(forDigest.encode('utf-8'))
        digest = m2.hexdigest()
        capObj['digest'] = digest

        capId = insertCapWithCapObj(capObj, conn2, csor2)
        if not capId:
            continue
        upload2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
Example #15
def kooleanStartByContentUrl(conn,
                             contentUrl,
                             csor,
                             desc='',
                             ntype='',
                             stage='',
                             tags='',
                             title=''):
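    """Download a koolearn article (including its extra pages), strip links and promo paragraphs, and insert it into daily_news_copy."""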
    detailHtmlContent = getContentWithUA(contentUrl, defaultPCUa)
    detailContentSoup = getSoupByStr(detailHtmlContent)
    detailContent = ''
    contentDiv = detailContentSoup.select_one('.show_l2 .mt40')
    contentDiv.select('p')[0].extract()  # the first <p> is the intro, remove it
    cps = contentDiv.select('p')
    for ci in range(0, len(cps)):
        if cps[ci].select('a'):
            print 'has link ,extract,  contentUrl:'
            cps[ci].extract()

        if ci in [len(cps) - 1, len(cps) - 2,
                  len(cps) - 3] and (u'新东方' in cps[ci].get_text()
                                     or u'来源' in cps[ci].get_text()):
            for cc in range(ci, len(cps)):
                cps[cc].extract()
            break
    detailContent = detailContent + unicode(contentDiv)
    # if the article is paginated, fetch the extra pages (the final back-link page is not counted)
    for page in range(2, 100):
        cUrl = contentUrl.replace('.html', '_' + str(page) + '.html')
        moreContentHtmlContent = getContentWithUA(cUrl, defaultPCUa)
        if not moreContentHtmlContent:
            print 'no more content, ', cUrl
            break
        moreContentSoup = getSoupByStr(moreContentHtmlContent)
        # drop the last two <p> tags
        moreContentDiv = moreContentSoup.select_one('.show_l2 .mt40')
        pps = moreContentDiv.select('p')
        for ci in range(0, len(pps)):
            if pps[ci].select('a'):
                # print 'has link ,extract, link:',unicode(pps[ci]),' contentUrl:',cUrl
                print 'has link ,extract, link2:'
                pps[ci].extract()

            if ci in [len(pps) - 1, len(pps) - 2,
                      len(pps) - 3] and (u'新东方' in pps[ci].get_text()
                                         or u'来源' in pps[ci].get_text()):
                for cc in range(ci, len(pps)):
                    pps[cc].extract()
                break
        # pps[len(pps) - 1].extract()
        # pps[len(pps) - 2].extract()

        [a.unwrap() for a in moreContentDiv.select('a')]
        for img in moreContentDiv.select('img'):
            if not img.has_attr('style') or len(img['style']) < 1:
                img['style'] = 'max-width:100%'
            else:
                preStyle = img['style']
                if preStyle.endswith(';'):
                    img['style'] = img['style'] + 'max-width:100%;'
                else:
                    img['style'] = img['style'] + ';max-width:100%'

        detailContent = detailContent + unicode(moreContentDiv)

    # save to DB
    csor.execute(
        'insert ignore into daily_news_copy (name,type,content,stage,author,tag,contentUrl,description) VALUES (%s,'
        '%s,%s,%s,%s,%s,%s,%s)',
        (title, ntype, detailContent.replace(u'新东方在线论坛', '').replace(
            u'相关链接:',
            '').replace(u'来源:新东方在线论坛', '').replace(u'新东方在线', '').replace(
                u'新东方', ''), stage, u'新东方', tags, contentUrl, desc))
    conn.commit()
Example #16
def getShuqiCapList(bookId):
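    """Collect every chapter id of a shuqi book, following both top-level pagination and sub-catalog (vid) pagination."""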

    capList = []

    pageIndex = 1
    capListAPIBase = 'http://api.shuqireader.com/reader/bc_chapter.php?pagesize=40&bookId='

    capListAPI = capListAPIBase + str(bookId) + '&pageIndex=' + str(
        pageIndex) + capListAPIDeviceInfo

    text = getContentWithUA(capListAPI, ua)

    if not (text and len(text) > 160):
        return

    root = ElementTree.fromstring(text.encode('utf-8'))

    gatherId = root.getiterator('BookInfos')[0].attrib['GatherId']
    TotalCount = int(root.getiterator('BookInfos')[0].attrib['TotalCount'])
    # if TotalCount > 40:
    topPageCount = TotalCount / 40 + 2  # total number of pages
    for i in range(1, topPageCount):  # with no pagination, i only ever equals 1
        if i == 1:
            pageRoot = root
        else:
            pageApi = capListAPIBase + str(bookId) + '&pageIndex=' + str(
                i) + capListAPIDeviceInfo
            pageText = getContentWithUA(pageApi, ua)

            if not (pageText and len(pageText) > 160):
                return
            pageRoot = ElementTree.fromstring(pageText.encode('utf-8'))

        if gatherId and gatherId != '':  # the book has sub-catalogs
            for book in pageRoot.getiterator('Book'):
                vId = book.attrib['ChapterId']

                secondApi = capListAPI + '&vid=' + str(vId)  # sub-catalog url
                textSon = getContentWithUA(secondApi, ua)  # sub-catalog content

                xmlSon = ElementTree.fromstring(
                    textSon.encode('utf-8'))  # sub-catalog xml

                sonTotalCount = int(
                    xmlSon.getiterator('BookInfos')
                    [0].attrib['TotalCount'])  # total records in the sub-catalog
                sonPageCount = sonTotalCount / 40 + 2  # total pages in the sub-catalog
                for j in range(1, sonPageCount):  # walk every page of the sub-catalog; with no pagination only the first page is visited
                    if j == 1:
                        sonpageRoot = xmlSon  # the first page needs no extra request
                    else:
                        morePageApi = capListAPIBase + str(bookId) + '&pageIndex=' + str(j) + capListAPIDeviceInfo \
                                  + '&vid=' + str(vId)  # paginated sub-catalog url
                        morePageText = getContentWithUA(morePageApi, ua)
                        sonpageRoot = ElementTree.fromstring(
                            morePageText.encode('utf-8'))  # sub-catalog xml

                    for realCap in sonpageRoot.getiterator('Book'):
                        realCapId = realCap.attrib['ChapterId']
                        # dealCap(bookId, realCapId)
                        capList.append(realCapId)

        else:  # no second-level catalog, so no extra API request or pagination is needed
            for realCap in pageRoot.getiterator('Book'):
                realCapId = realCap.attrib['ChapterId']
                # dealCap(bookId, realCapId)
                capList.append(realCapId)

    return capList
Example #17
def today():
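    """Crawl todayonhistory.com for every month/day, extract each event's article body, and insert it into daily_today."""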
    baseUrl = 'http://www.todayonhistory.com/'
    conn, csor = getTmathConnCsor()
    for month in range(1, 13):
        for day in range(1, 32):
            type = '全部'
            jsonurl = baseUrl + str(month) + '/' + str(day)
            htmlContent = getContentWithUA(jsonurl, defaultPCUa)
            if not htmlContent or u'404-历史上的今天' in htmlContent:
                print 'no content skip month:', str(month), ' day:', str(day)
                continue
            soup = getSoupByStr(htmlContent)
            if '404' in soup.title.get_text():
                print '404 skip month:', str(month), ' day:', str(day)
                continue
            listUl = soup.select_one('ul.oh')
            for listLi in listUl.select('li'):
                liClasses = listLi['class']
                if 'typeid_53' in liClasses:
                    type = u'纪念'
                elif 'typeid_54' in liClasses:
                    type = u'节假日'
                elif 'typeid_55' in liClasses:
                    type = u'逝世'
                elif 'typeid_56' in liClasses:
                    type = u'出生'
                elif 'typeid_57' in liClasses:
                    type = u'事件'
                solarYear = listLi.select_one('span[class="poh"]').get_text()
                link = listLi.select_one('a')
                if not link:
                    print 'no link content, maybe bs4 bug, skip'
                    continue
                contentUrl = link['href']
                title = link['title']
                contentText = ''

                imgUrl = ''
                imgTag = listLi.select_one('img')
                if imgTag:
                    imgUrl = urlparse.urljoin(baseUrl, imgTag['src'])

                detailContentHtml = getContentWithUA(contentUrl, defaultPCUa)
                if detailContentHtml:
                    contentSoup = getSoupByStr(detailContentHtml)
                    contentBody = contentSoup.select_one('.body')

                    n1 = contentBody.select_one('.page')
                    if n1:
                        n1.extract()
                    n2 = contentBody.select_one('.keyword')
                    if n2:
                        n2.extract()
                    n3 = contentBody.select_one('.extra')
                    if n3:
                        n3.extract()
                    n4 = contentBody.select_one('.mgg')
                    if n4:
                        n4.extract()
                    n5 = contentBody.select_one('.poh')
                    if n5:
                        n5.extract()
                    n6 = contentBody.select_one('.framebox')
                    if n6:
                        n6.extract()

                    # for divTag in contentBody.select('div'):
                    #     divTag.extract()  # remove the extra divs

                    contentText = unicode(contentBody)
                csor.execute(
                    'insert ignore into daily_today (name ,type ,content  '
                    ',month ,day ,thumbImg ,solaryear,srcUrl) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                    (title, type, contentText, month, day, imgUrl, solarYear,
                     contentUrl))
                conn.commit()

            jsonBaseUrl = 'http://www.todayonhistory.com/index.php?m=content&c=index&a=json_event&page='
            #&pagesize=40&month=2&day=13'
            for page in range(1, 5):
                jsonurl = jsonBaseUrl + str(
                    page) + '&pagesize=40&month=' + str(month) + '&day=' + str(
                        day)
                jsonContent = getContentWithUA(jsonurl, defaultPCUa)
                if not jsonContent or len(jsonContent) < 10:
                    print 'json url return null or too short, maybe finished'
                    break
                jsonLists = json.loads(jsonContent)
                for jsonObj in jsonLists:
                    tid = jsonObj['id']
                    contentUrl2 = jsonObj['url']
                    title = jsonObj['title']
                    thumb = urlparse.urljoin(baseUrl, jsonObj['thumb'])
                    solaryear = jsonObj['solaryear']

                    contentText = ''

                    detailContentHtml = getContentWithUA(
                        contentUrl2, defaultPCUa)
                    if detailContentHtml:
                        contentSoup = getSoupByStr(detailContentHtml)
                        contentBody = contentSoup.select_one('.body')
                        # for divTag in contentBody.select('div'):
                        #     divTag.extract()  # remove the extra divs

                        n1 = contentBody.select_one('.page')
                        if n1:
                            n1.extract()
                        n2 = contentBody.select_one('.keyword')
                        if n2:
                            n2.extract()
                        n3 = contentBody.select_one('.extra')
                        if n3:
                            n3.extract()
                        n4 = contentBody.select_one('.mgg')
                        if n4:
                            n4.extract()
                        n5 = contentBody.select_one('.poh')
                        if n5:
                            n5.extract()
                        n6 = contentBody.select_one('.framebox')
                        if n6:
                            n6.extract()
                        n7 = contentBody.select_one('.mad')
                        if n7:
                            n7.extract()

                        contentText = unicode(contentBody)
                    csor.execute(
                        'insert ignore into daily_today (name ,type ,content  '
                        ',month ,day ,thumbImg ,solaryear,srcUrl) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                        (title, '全部', contentText, month, day, thumb,
                         solaryear, contentUrl2))
                    conn.commit()

            print 'done month:', str(month), ' day: ', str(day)