Python getAndParse Examples, executor.getAndParse Python Examples

Example #1

0

Show file

File: easouCrawl.py Project: zyq001/pycrawler

def insertCap(bookObj, capUrl, capName, idx, queue):

    title = capName
    rawUrl = capUrl
    content, source = getAndParse(capUrl)
    if not (content and source):
        print 'no content got, fill with temp ,capUrl : ', capUrl

        content = '暂缺，请稍后再来'
        source = ''

    bookId = bookObj['id']
    size = len(content)
    bookUUID = bookObj['digest']

    import hashlib

    m2 = hashlib.md5()
    forDigest = capName + u'#' + source
    m2.update(forDigest.encode('utf-8'))
    digest = m2.hexdigest()

    capObj = dict()
    capObj['title'] = title
    capObj['rawUrl'] = rawUrl
    capObj['source'] = source
    capObj['content'] = content
    capObj['bookId'] = bookId
    capObj['idx'] = idx
    capObj['digest'] = digest
    capObj['size'] = size
    capObj['bookUUID'] = bookUUID

    queue.put(capObj)

Example #2

0

Show file

File: easouCrawl.py Project: zyq001/pycrawler

def updateByCategoryIdZongheng(catId):
    sql = 'SELECT id,rawUrl,digest from cn_dushu_book  where categoryCode = ' + str(
        catId) + ' and rawUrl like "%zongheng%" ORDER BY id desc;'
    try:
        csor.execute(sql)
        conn.commit()
    except Exception as e:
        #     # 发生错误时回滚
        print 'mysql ex: ', e
    results = csor.fetchall()
    for book in results:
        bid = book[0]
        url = book[1]
        bookDigest = book[2]
        deleteCapsByBookId(bid)

        url = url.replace('com/book', 'com/showchapter')

        content = getContentWithUA(url, ua)

        soup = getSoupByStr(content)

        caps = soup.select('.chapterBean')
        if not caps:
            continue
        for i in range(0, len(caps)):
            cap = caps[i]
            capUrl = cap.select('a')[0]['href']
            capName = cap.select('a')[0].get_text()
            content, host = getAndParse(capUrl)
            if not content:
                continue
            capObj = dict()
            capObj['title'] = capName
            capObj['rawUrl'] = capUrl
            capObj['source'] = '纵横'
            capObj['content'] = content
            capObj['bookId'] = bid
            capObj['idx'] = i

            m2 = hashlib.md5()
            forDigest = capName + u'#' + str(i)
            # forDigest = u'总裁我很忙#jxj季'
            m2.update(forDigest.encode('utf-8'))
            digest = m2.hexdigest()

            capObj['digest'] = digest
            capObj['size'] = len(content)
            capObj['bookUUID'] = bookDigest

            insertCapWithCapObj(capObj, conn, csor)

Example #3

0

Show file

File: easouCrawl.py Project: zyq001/pycrawler

def updateByBookId(id):
    res = getExistsCapsRawUrlId(int(id))
    if not res:
        return
    for cap in res:
        cid = cap[0]
        url = cap[1]
        if not url or len(url) < 1:
            print cid, 'no url, skipp'
            break
        content, host = getAndParse(url)
        if not content:
            continue
        updateContentById(cid, content)

Example #4

0

Show file

File: shuqi2.py Project: zyq001/pycrawler

def updateCapFromTo(f, t):

    print 'from', str(f), ' to ', str(t)

    offset = 100

    begin = f
    end = begin + offset
    while end <= t:
        # sql = "select id, rawUrl,bookId,content from cn_dushu_acticle where id >= %d and id < %d" % (begin, end)
        try:
            csor2.execute(
                "select id, rawUrl,bookId,content from cn_dushu_acticle where id >= %d and id < %d",
                (begin, end))
            conn2.commit()
        except Exception as e:
            #     # 发生错误时回滚
            print 'mysql ex: ', e

        begin = begin + offset
        end = end + offset

        results = csor2.fetchall()
        for cap in results:
            cid = cap[0]
            capUrl = cap[1]
            bookId = cap[2]
            unclearContent = cap[3]
            if not (u'        言情小说_打造最新原创' in unclearContent
                    or unclearContent == 'None'):
                continue
            try:
                if not capUrl or len(capUrl) < 1:
                    print 'no url, bookId : ', bookId
                if 'shuqireader' in capUrl:
                    content = getContentByUrl(capUrl)
                    # updateContentById(cid, content)
                else:
                    content, host = getAndParse(capUrl)
                    if not content:
                        continue
                updateContentById(cid, content)
            except Exception as e:
                print 'cid ', cid, 'error: ', e
            except ValueError as er:
                print 'cid ', cid, 'error: ', er

Example #5

0

Show file

File: shuqi2.py Project: zyq001/pycrawler

def handleCapUpload(cap):
    cid = cap[0]
    capUrl = cap[2]
    bookId = cap[5]
    unclearContent = cap[4]
    capObj = dict()
    capObj['id'] = cap[0]
    capObj['title'] = cap[1]
    capObj['rawUrl'] = cap[2]
    capObj['source'] = cap[3]
    capObj['content'] = cap[4]
    capObj['bookId'] = cap[5]
    capObj['idx'] = cap[6]
    capObj['digest'] = cap[7]
    capObj['size'] = cap[8]
    capObj['bookUUID'] = cap[9]
    content = unclearContent
    if unclearContent and not (u'        言情小说_打造最新原创' in unclearContent
                               or unclearContent == 'None'):
        upload2Bucket(str(cid) + '.json', json.dumps(capObj))
    else:
        try:
            if not capUrl or len(capUrl) < 1:
                print cid, 'no url, bookId : ', bookId
            else:
                if 'shuqireader' in capUrl:
                    content = getContentByUrl(capUrl)
                    # updateContentById(cid, content)
                else:
                    content, host = getAndParse(capUrl)
                    if not content:
                        print cid, ' getAndparse content failed, bookId : ', bookId
                        # continue
                        # updateContentById(cid, content)
                        # cap[4] = content
            capObj['content'] = content

            upload2Bucket(str(cid) + '.json', json.dumps(capObj))
        except Exception as e:
            print 'cid ', cid, 'error: ', e
        except ValueError as er:
            print 'cid ', cid, 'error: ', er