Code example #1
File: easouCrawl.py  Project: zyq001/pycrawler
def easouCrawl():
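    """Crawl the easou library of completed books: walk every top category's 'All' page, then each tag listing under it, and hand pages 1-4 of each tag listing to dealTagPage()."""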
    catDict = loadCategory()
    shukuUrl = 'http://b.easou.com/w/booklib/all_c_l_2_0_1.html'  # completed books
    shukuHtml = getContent(shukuUrl)

    if not shukuHtml:
        return

    shukuSoup = getSoupByStr(shukuHtml)

    conds = shukuSoup.select('.condition')

    if not conds or len(conds) < 1:
        print 'shuku error, stop '
        return
    for cat in conds[1].select('.n a'):  # iterate over each top category's 'All' page

        catUrl = urlparse.urljoin(shukuUrl, cat['href'])

        topCatName = cat.get_text()

        booklistHtml = getContent(catUrl)  # the top category's 'All' page
        if not booklistHtml:
            continue

        booklistSoup = getSoupByStr(booklistHtml)
        Catconds = booklistSoup.select('.condition')
        if not Catconds or len(Catconds) < 1:
            print 'category error, skip ,', cat.get_text()
            continue
        for tag in Catconds[2].select('.n a'):  # first iterate over the tag pages under this top category

            tagUrl = urlparse.urljoin(catUrl, tag['href'])
            tagName = tag.get_text()

            if catDict.has_key(tagName):
                tagObj = catDict[tagName]
            else:
                tagObj = catDict[u'全部']

            dealTagPage(shukuUrl, tagName, tagObj, tagUrl)
            for n in range(2, 5):
                dealTagPage(shukuUrl, tagName, tagObj,
                            tagUrl.replace('1.html',
                                           str(n) + '.html'))

        tag2 = Catconds[2].select('.all a')[0]  # select() returns a list; take the first 'All' link
        tagUrl = urlparse.urljoin(catUrl, tag2['href'])
        tagName = u'其他' + tag2.get_text()  # prefixed with u'其他' ("Other")
        if catDict.has_key(tagName):
            tagObj = catDict[tagName]
        else:
            tagObj = catDict[u'全部']

        dealTagPage(shukuUrl, tagName, tagObj, tagUrl)
        for n in range(2, 5):
            dealTagPage(shukuUrl, tagName, tagObj,
                        tagUrl.replace('1.html',
                                       str(n) + '.html'))
Code example #2
File: shuqi2.py  Project: zyq001/pycrawler
def getBookObjFromSQid(id, shuqCategory):
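    """Query the shuqi bc_cover API for the given book id, map the XML response into a bookObj dict, and return (bookObj, digest); returns (None, None) if the response is missing or lacks the required fields."""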
    bookInfoAPI = 'http://api.shuqireader.com/reader/bc_cover.php?bookId=' + str(
        id
    ) + '&book=same&book_num=5&bbs=pinglun&bbs_num=8&bbs_rand_num=1&lastchaps=1&ItemCount=3&soft_id=1&ver=110817&platform=an&placeid=1007&imei=862953036746111&cellid=13&lac=-1&sdk=18&wh=720x1280&imsi=460011992901111&msv=3&enc=666501479540451111&sn=1479540459901111&vc=e8f2&mod=M3'
    text = getContent(bookInfoAPI)
    if not (text and len(text) > 160):
        return None, None
    root = ElementTree.fromstring(text.encode('utf-8'))
    BookType = ''
    if len(root.getiterator('BookType')) > 0:
        BookType = root.getiterator('BookType')[0].text
    category = ''
    if len(root.getiterator('NickName')) > 0:
        category = root.getiterator('NickName')[0].text
    tag = ''
    if len(root.getiterator('ShortNickName')) > 0:
        tag = root.getiterator('ShortNickName')[0].text
    tagId = 0
    nickIds = root.getiterator('NickId')
    if nickIds and nickIds[0].text:
        tagId = int(nickIds[0].text)
    firstCid = 0
    firstChapterIds = root.getiterator('ChapteridFirst')
    if firstChapterIds and firstChapterIds[0].text:
        firstCid = int(firstChapterIds[0].text)

    if (not BookType) and (not category) and (not tag) and (not tagId):
        return None, None
    categoryId = 0
    if shuqCategory.has_key(tag):
        if shuqCategory[tag]['id'] and len(shuqCategory[tag]['id']) > 0:
            categoryId = int(shuqCategory[tag]['id'])
    size = 1
    if root.getiterator('Size') and len(root.getiterator('Size')) > 0:
        strSize = root.getiterator('Size')[0].text
        size = sizeStr2Int(strSize)
    NumChapter = 1
    numChapters = root.getiterator('NumChapter')
    if numChapters and numChapters[0].text:
        NumChapter = int(numChapters[0].text)
    source = 'shuqi' + str(id)
    subtitle = root.getiterator('Description')[0].text
    title = root.getiterator('BookName')[0].text
    author = root.getiterator('Author')[0].text
    imgurl = root.getiterator('ImageExists')[0].text
    certainBookUrl = 'http://api.shuqireader.com/reader/bc_cover.php?bookId=' + str(
        id)
    if not title or not author:
        return None, None
    bookObj = dict()
    bookObj['subtitle'] = subtitle
    bookObj['source'] = source
    bookObj['rawUrl'] = certainBookUrl
    bookObj['title'] = title
    bookObj['chapterNum'] = NumChapter
    bookObj['imgUrl'] = imgurl
    bookObj['author'] = author
    bookObj['size'] = size
    bookObj['category'] = tag
    bookObj['type'] = category
    bookObj['bookType'] = BookType
    # bookObj['typeCode'] = 100 + tagId
    # bookObj['categoryCode'] = 100 + categoryId
    bookObj['typeCode'] = tagId
    bookObj['categoryCode'] = categoryId
    bookObj['firstCid'] = firstCid
    bookObj['viewNum'] = 0

    m2 = hashlib.md5()
    forDigest = title + u'#' + author
    m2.update(forDigest.encode('utf-8'))
    digest = m2.hexdigest()
    bookObj['digest'] = digest
    return bookObj, digest
Code example #3
File: easouCrawl.py  Project: zyq001/pycrawler
def dealTagPage(shukuUrl, tagName, tagObj, tagUrl):
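    """Crawl one easou tag listing page: for each book, follow its search result to the detail page and table of contents, insert the book record, and asynchronously crawl (via insertCap) any chapters not already stored."""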
    tagBooksHtml = getContent(tagUrl)
    if not tagBooksHtml:
        return
    tagBooksSoup = getSoupByStr(tagBooksHtml)
    for book in tagBooksSoup.select('.listcontent .name a'):
        t = random.randint(60, 100)  # throttle: sleep 60-100 seconds between books
        time.sleep(t)
        bookUrl = urlparse.urljoin(tagUrl, book['href'])  # the book search results page
        bookName = book.get_text()

        bookObj = dict()  # build the book object to pass between functions
        bookObj['title'] = bookName
        bookObj['type'] = tagName
        bookObj['typeCode'] = tagObj['typeCode']
        bookObj['categoryCode'] = tagObj['categoryCode']
        bookObj['category'] = tagObj['category']

        # bookId, bookUUID = insertBook(bookObj)  # insert the Book record into the DB and get bookId and uuid

        bookMidHtml = getContent(bookUrl)
        if not bookMidHtml:
            continue
        bookMidSoup = getSoupByStr(bookMidHtml)

        lis = bookMidSoup.select('.resultContent li')

        if not lis or len(lis) < 1:
            print 'get book result list error, skip, book: ', bookUrl, ' tagUrl: ', tagUrl
            continue

        bookLi = lis[0]

        bookImgs = bookLi.select('.imgShow')[0].select('img')
        if bookImgs and len(bookImgs) > 0:
            imgsrc = bookImgs[0]['src']
            bookImg = urlparse.urljoin(bookUrl, imgsrc).replace(
                'http://b.easou.com/w/resources/imgs/pic.gif', DefaultImg)

        else:
            bookImg = DefaultImg

        certainBookUrl = urlparse.urljoin(bookUrl,
                                          bookLi.select('.name a')[0]['href'])
        author = bookLi.select('.author a')[0].get_text()
        count = bookLi.select('.count')[0].get_text()
        count = count.replace(u'追书人数:', '').replace(u'人追', '')  # strip the "follower count:" label and "followers" suffix
        bookObj['author'] = author
        bookObj['viewNum'] = count
        bookObj['imgUrl'] = bookImg

        # the book's detail page
        certainBookHtml = getContent(certainBookUrl)
        if not certainBookHtml:
            continue
        certainBookSoup = getSoupByStr(certainBookHtml)

        subtitle = certainBookSoup.select('.desc')[0].get_text()
        source = certainBookSoup.select('.source .t')[0].get_text()

        bookObj['subtitle'] = subtitle
        bookObj['source'] = source
        bookObj['rawUrl'] = certainBookUrl

        agendaLinks = certainBookSoup.select('.dao .category a')

        if not agendaLinks or len(agendaLinks) < 1:
            print 'get book agenda list error, skip, book: ', bookUrl, ' tagUrl: ', tagUrl
            continue

        # the book's table-of-contents page
        agendaHtml = getContent(
            urlparse.urljoin(certainBookUrl, agendaLinks[0]['href']))
        if not agendaHtml:
            continue
        agendaSoup = getSoupByStr(agendaHtml)
        # finally reached the table of contents

        caplist = list()

        caps = agendaSoup.select('.category li a')

        for cap in caps:
            capObj = {}
            capObj['url'] = urlparse.urljoin(shukuUrl, cap['href'])
            capObj['name'] = cap.get_text()
            caplist.append(capObj)

        pages = agendaSoup.select('.pager a')

        if pages and len(pages) > 0:
            # follow the remaining table-of-contents pages (all pager links except the last)
            for pageA in pages[:-1]:
                nextPageUrl = urlparse.urljoin(bookUrl, pageA['href'])
                agendaHtml2 = getContent(nextPageUrl)

                if not agendaHtml2:
                    continue
                agendaSoup2 = getSoupByStr(agendaHtml2)
                # reached the next table-of-contents page

                caps = agendaSoup2.select('.category li a')

                for cap in caps:
                    capObj = {}
                    capObj['url'] = urlparse.urljoin(shukuUrl, cap['href'])
                    capObj['name'] = cap.get_text()
                    caplist.append(capObj)

        bookObj['chapterNum'] = len(caplist)
        # bookAreadyCrawled = insertBook(bookObj)
        # if not bookAreadyCrawled:
        #     checkCapsSql = 'select count(*) from cn_dushu_article where bookId = %d' % (bookObj['id'])
        #     try:
        #         csor.execute(checkCapsSql)
        #         conn.commit()
        #         results = csor.fetchall()
        #
        #         if not results or len(results) < 1:
        #             return None
        #         else:
        #             bookObj['id'] = results[0][0]
        #     except Exception as e:
        #         #     # roll back on error
        #         print 'check cap count failed ,skip', e

        if not insertBook(bookObj):  # insert the Book record into the DB; bookId and uuid are written into bookObj
            print 'error, skip, bookName', bookObj['title']
            continue

        bookId = bookObj['id']
        existsCaps = getExistsCaps(bookId)

        for m in range(0, len(caplist)):

            if existsCaps and len(existsCaps) > 0:
                noNeedCrawlCap = False
                for cap in existsCaps:
                    if cap[0] == m and cap[1] > 300:
                        noNeedCrawlCap = True
                        break
                if noNeedCrawlCap:
                    print 'cap exists, no need to recrawl, bookName', bookObj['title'], ' bookId', bookId, ' capIndex: ', m
                    continue

            capUrl = caplist[m]['url']
            capName = caplist[m]['name']
            p.apply_async(insertCap, args=(bookObj, capUrl, capName, m, queue))
Code example #4
def juren():
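    """Crawl the daily-problem listing pages (tiku/mryt/yimryt) on aoshu.juren.com, plus each article's '_2' continuation page, and insert the extracted articles into the daily table."""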
    csor, conn = getConn()

    # xiaoshengchu (elementary-to-junior-high) jokes section
    for i in range(1, 25):
        # jokes
        # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/index_' + str(i) + '.html'
        # stories
        url = 'http://aoshu.juren.com/tiku/mryt/yimryt/index_' + str(
            i) + '.html'
        # famous people
        # url =
        if i == 1:
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaogushi/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/neirongsucai/'
            url = 'http://aoshu.juren.com/tiku/mryt/yimryt/'
        content = getContent(url)
        if not content:
            print 'get content failed, url: ', url
            continue
        soup = getSoupByStr(content)
        if not soup:
            print 'get soup failed, url:', url
            continue
        for listing in soup.select(".listing1"):
            for a in listing.select('a'):
                text = a.get_text()
                titles = text.split(u':')  # try the fullwidth colon first
                if len(titles) < 2:
                    titles = text.split(u':')  # fall back to the ASCII colon
                if len(titles) < 2:
                    title = text
                else:
                    title = titles[1]
                detailUrl = a['href']
                contentHtml = getContent(detailUrl)
                if not contentHtml:
                    print 'get detail failed'
                    continue
                contentSoup = getSoupByStr(contentHtml).select('.mainContent')
                content = ''
                ps = contentSoup[0].select('p')
                length = len(ps)
                for j in range(1, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    # stop at promo/navigation paragraphs ("recommended topics",
                    # "click next page for the answer", etc.) or paragraphs containing links
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText\
                            or u'点击下一页查看答案' in pText or u'下一页查看答案' in pText or u'查看答案' in pText\
                            or len(pJ.select('a')) > 0:
                        print 'not content,break,  text:' + pText
                        break
                    content += unicode(pJ)
                contentHtml2 = getContent(detailUrl.replace('.html', '_2.html'))
                if not contentHtml2:
                    print 'get detail failed'
                    continue
                # contentSoup2 = getSoupByStr(contentHtml2.replace('<br /></p>','')).select('.mainContent')
                contentSoup2 = getSoupByStr(contentHtml2).select(
                    '.mainContent')
                ps = contentSoup2[0].select('p')
                length = len(ps)
                for j in range(0, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText or len(
                            pJ.select('a')) > 0:
                        print 'not content,break,  text:' + pText
                        break
                    content += unicode(pJ)

                sql = "INSERT IGNORE INTO daily(name, type, content, stage, gred) " \
                      "VALUES ('%s', '%d', '%s', '%s', '%d')" % \
                      (title, 3, content, '3', 1)
                try:
                    # execute the SQL statement
                    print sql
                    csor.execute(sql)
                    # commit to the database
                    print conn.commit()
                except:
                    # roll back on error
                    conn.rollback()
    conn.close()
Code example #5
    results = csor.fetchall()

    lastTime = 0

    for row in results:
        # content = row[1]
        # content = row[4].replace('mi', 'mo')
        id = row[0]
        # url = row[1]
        url = 'http://www.3dllc.com/html/37/37023/9515879.html'

        # if not u'easou' in url:
        #     continue

        newContent = getContent(url)

        doc = Document(newContent)

        content = doc.summary(html_partial=True)

        #
        # soup = getSoupByStr(newContent)
        #
        # ps = soup.select('#chapterContent')[0]
        # # ps.select('div')[0].unwrap()
        # # ps.unwrap()
        # for water in soup.select('.watermark'):
        #     water.extract()

        #