Python parserの例、utils.bs4helper.parser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: csdn_author.py プロジェクト: huligong1234/python-study

def get_author_list(str_url, site_category):
    """Collect one record per author link listed in the ``experts`` element.

    The page at ``str_url`` is loaded through ``parser``; every ``<dd>``
    inside the first element whose id is ``experts`` contributes one tuple.

    Args:
        str_url: Address of the listing page handed to ``parser``.
        site_category: Accepted for interface compatibility; not used here.

    Returns:
        A list of tuples ``(site, url, user_name, rss, timestamp)`` where
        ``site`` is always ``'blog.csdn.net'``, ``user_name`` is the part of
        the URL after ``'net/'``, ``rss`` is the URL with ``'/rss/list'``
        appended and ``timestamp`` comes from ``datehelper.now_datetime()``.

    Raises:
        IndexError: If no ``experts`` element exists, a ``<dd>`` has no
            link, or the link URL does not contain ``'net/'``.
    """
    soup = parser(str_url)
    experts_block = soup.find_all(SoupStrainer(id="experts"))[0]

    site = 'blog.csdn.net'
    records = []
    for entry in experts_block.select("dd"):
        first_link = entry.select("a")[0]
        url = smart_str(first_link['href'].strip())
        # Everything after 'net/' in the profile URL is taken as the user name.
        user_name = url.split('net/')[1]
        rss = url + '/rss/list'
        records.append(
            (site, url, user_name, rss, datehelper.now_datetime()))
    return records

コード例 #2

0

ファイルを表示

ファイル: 360buy_booktop_computer.py プロジェクト: huligong1234/python-study

def get_book_list(str_url):
    """Scrape the book ranking entries from the page at ``str_url``.

    Every element matching ``.item`` yields one tuple. The credit links in
    the first ``.p-info`` element are interpreted by their count:
    one link is the publisher; two links are author and publisher; three
    links are author, translator and publisher. Any other count leaves all
    three fields empty.

    Args:
        str_url: Address of the ranking page handed to ``parser``.

    Returns:
        A list of tuples ``(index, book_name, book_img, book_url,
        book_auther, book_trans_auther, book_publisher, del_price,
        jd_price)``.

    Raises:
        IndexError: If an item lacks one of the expected child elements.
    """
    soup = parser(str_url)
    cur_page_data_list = []
    for item in soup.select('.item'):
        # Ranking position of the book.
        index = smart_str(item.select('.index')[0].contents[0].string)

        p_name = item.select('.p-name')[0].contents[0]
        book_url = smart_str(p_name['href'])  # link to the book page
        book_name = smart_str(p_name.text)  # book title

        book_info = item.select('.p-info')
        credit_links = book_info[0].select('a')

        book_auther = ''  # author
        book_trans_auther = ''  # translator
        book_publisher = ''  # publisher
        link_count = len(credit_links)
        if link_count == 1:
            book_publisher = smart_str(credit_links[0].text)
        elif link_count == 2:
            # Fix: the author is the first link; the original read index 1
            # for both author and publisher, duplicating the publisher.
            book_auther = smart_str(credit_links[0].text)
            book_publisher = smart_str(credit_links[1].text)
        elif link_count == 3:
            book_auther = smart_str(credit_links[0].text)
            book_trans_auther = smart_str(credit_links[1].text)
            book_publisher = smart_str(credit_links[2].text)

        book_img = smart_str(item.select('.p-img.bookimg')[0].img['src'])

        book_prices = book_info[1]
        # List price and shop price, with the currency sign stripped.
        del_price = smart_str(book_prices.select('del')[0].text).replace('￥', '')
        jd_price = smart_str(book_prices.select('span')[0].text).replace('￥', '')

        cur_page_data_list.append((
            index,
            book_name,
            book_img,
            book_url,
            book_auther,
            book_trans_auther,
            book_publisher,
            del_price,
            jd_price,
        ))
    return cur_page_data_list

コード例 #3

0

ファイルを表示

ファイル: taobao_shop_sale.py プロジェクト: huligong1234/python-study

def countSale(str_url):
    """Sum price times quantity over every ``.item`` element on the page.

    For each item the price is read from the first child of its first
    ``<strong>`` element and the quantity from the first child of its first
    ``<em>`` element; an item with no ``<em>`` counts as quantity 0.

    Args:
        str_url: Address of the shop page handed to ``parser``.

    Returns:
        The total as a float; each item's subtotal is rounded to two
        decimals before being added.

    Raises:
        IndexError: If an item has no ``<strong>`` element.
        ValueError: If the price text holds no parsable number or the
            quantity text is not an integer.
    """
    soup = parser(str_url)
    sale = 0.0  # running total of sales
    for item in soup.select('.item'):
        pricestr = item.select('strong')[0]
        amount_list = item.select('em')
        # Units sold for this item; 0 when the page shows no quantity.
        amount = int(amount_list[0].contents[0]) if amount_list else 0

        price_str = smart_str(pricestr.contents[0])
        # Keep only digits and dots. ''.join works on Python 2 and 3, whereas
        # float(filter(...)) fails on Python 3 because filter is lazy there.
        price = float(''.join(ch for ch in price_str if ch in '0123456789.'))
        sale += round(price * amount, 2)
    return sale

コード例 #4

0

ファイルを表示

ファイル: taobao_shop_sale.py プロジェクト: huligong1234/python-study

def countSale(str_url):
    """Return the summed sales value of all ``.item`` elements on a page.

    Args:
        str_url: Address of the shop page handed to ``parser``.

    Returns:
        Float total of ``round(price * amount, 2)`` over all items, where
        ``price`` comes from the item's first ``<strong>`` and ``amount``
        from its first ``<em>`` (0 when there is no ``<em>``).

    Raises:
        IndexError: If an item has no ``<strong>`` element.
        ValueError: If the price or quantity text cannot be converted.
    """
    numeric_chars = '0123456789.'
    total = 0.0
    items = parser(str_url).select('.item')
    for item in items:
        price_node = item.select('strong')[0]
        quantity_nodes = item.select('em')

        if quantity_nodes:
            amount = int(quantity_nodes[0].contents[0])
        else:
            amount = 0  # no quantity shown for this item

        raw_price = smart_str(price_node.contents[0])
        # Build the numeric text explicitly: on Python 3 filter() returns an
        # iterator, so the original float(filter(...)) raised TypeError.
        kept = [ch for ch in raw_price if ch in numeric_chars]
        price = float(''.join(kept))

        total += round(price * amount, 2)
    return total

コード例 #5

0

ファイルを表示

ファイル: ssq.py プロジェクト: huligong1234/python-study

def get_ssq_list(str_url):
    """Extract draw records from the first ``<table>`` of the page.

    The first two rows of the table are skipped, as is any row with five
    or fewer ``<td>`` cells.

    Args:
        str_url: Address of the results page handed to ``parser``.

    Returns:
        A list of 13-item tuples: draw date, issue number, the text of the
        first seven ``<em>`` elements in the third cell (named red1..red6
        and blue1 in the original code), sales amount, first-prize count,
        second-prize count, and the first-prize distribution text with
        ``"("``, ``".."`` and ``")"`` removed and whitespace stripped.

    Raises:
        IndexError: If the page has no table or a row lacks expected parts.
    """
    records = []
    soup = parser(str_url)
    first_table = soup.find_all(SoupStrainer("table"))[0]

    for position, row in enumerate(first_table.find_all('tr')):
        if position <= 1:
            continue  # the first two rows are skipped
        cells = row.find_all('td')
        if len(cells) <= 5:
            continue

        draw_date = cells[0].text
        issue_no = cells[1].text
        ball_tags = cells[2].select('em')
        # Seven numbers per draw: six red followed by one blue.
        balls = [ball_tags[k].text for k in range(7)]

        sales = cells[3].contents[0].string
        first_prize = cells[4].contents[0].string
        second_prize = cells[5].contents[0].string

        # The distribution is the second child of the first-prize cell.
        distribution = smart_str(cells[4].contents[1].string)
        for unwanted in ("(", "..", ")"):
            distribution = distribution.replace(unwanted, "")
        distribution = distribution.strip()

        records.append(
            (draw_date, issue_no)
            + tuple(balls)
            + (sales, first_prize, second_prize, distribution))

    return records

コード例 #6

0

ファイルを表示

ファイル: taobao_shop_goods.py プロジェクト: huligong1234/python-study

def get_goods_data_list(str_url):
    """Print selected parts of a goods detail page.

    The function prints the ``.tb-key.tb-key-sku`` and
    ``.tb-evaluate.tb-clearfix`` matches found inside the first
    ``.tb-property`` element. The goods name and price are looked up but
    not yet used or returned.

    Args:
        str_url: Address of the goods page handed to ``parser``.

    Returns:
        None.

    Raises:
        IndexError: If the page lacks the ``.tb-detail-hd``,
            ``.tb-property`` or ``.tb-detail-price`` elements, or the
            price block has no ``<strong>``.
    """
    soup = parser(str_url)
    # Looked up but not used yet; kept so a changed page layout still fails
    # here.
    goods_name = smart_str(soup.select('.tb-detail-hd')[0].h3.contents[0].string)

    # Renamed from ``property`` so the builtin is no longer shadowed.
    properties = soup.select('.tb-property')
    goods_price = properties[0].select('.tb-detail-price')
    j_str_price = goods_price[0].select('strong')[0].text

    sold_out = properties[0].select('.tb-sold-out.tb-clearfix')
    evaluate = properties[0].select('.tb-evaluate.tb-clearfix')

    # Single-argument print(...) prints the same on Python 2 and is valid
    # syntax on Python 3, unlike the original print statements.
    print(properties[0].select('.tb-key.tb-key-sku'))

    print(evaluate)

コード例 #7

0

ファイルを表示

ファイル: taobao_shop_sale.py プロジェクト: huligong1234/python-study

def getTotalPage(str_url):
    """Return the integer that follows the first ``/`` in the page-info text.

    The text is the string form of the first child of the first
    ``.page-info`` element (presumably something like ``"1/23"`` -- confirm
    against the real page).

    Args:
        str_url: Address of the page handed to ``parser``.

    Returns:
        The second ``/``-separated field converted to ``int``.

    Raises:
        IndexError: If there is no ``.page-info`` element or no ``/``.
        ValueError: If the field after ``/`` is not an integer.
    """
    first_child = parser(str_url).select('.page-info')[0].contents[0]
    return int(str(first_child).split('/')[1])

コード例 #8

0

ファイルを表示

ファイル: taobao_shop_sale.py プロジェクト: huligong1234/python-study

def getTotalPage(str_url):
    """Read the total page count from the ``.page-info`` element.

    Args:
        str_url: Address of the page handed to ``parser``.

    Returns:
        ``int`` of the text between the first and second ``/`` (or the end)
        in the string form of the element's first child.

    Raises:
        IndexError: If ``.page-info`` is missing or its text has no ``/``.
        ValueError: If that text is not an integer.
    """
    document = parser(str_url)
    pager = document.select('.page-info')[0]
    label = str(pager.contents[0])
    fields = label.split('/')
    # The field after the first slash is used as the page total.
    total_text = fields[1]
    return int(total_text)

コード例 #9

0

ファイルを表示

def _split_book_credits(anchors):
    """Map a book's credit links to ``(author, translator, publisher)``."""
    count = len(anchors)
    if count == 1:
        return '', '', smart_str(anchors[0].text)
    if count == 2:
        # Fix: the author is link 0. The original took link 1 for both the
        # author and the publisher, so the author field held the publisher.
        return smart_str(anchors[0].text), '', smart_str(anchors[1].text)
    if count == 3:
        return (smart_str(anchors[0].text),
                smart_str(anchors[1].text),
                smart_str(anchors[2].text))
    # Any other number of links leaves all three fields empty.
    return '', '', ''


def get_book_list(str_url):
    """Return one tuple per ``.item`` element on the ranking page.

    Args:
        str_url: Address of the ranking page handed to ``parser``.

    Returns:
        A list of tuples ``(index, book_name, book_img, book_url,
        book_auther, book_trans_auther, book_publisher, del_price,
        jd_price)``. The two prices have the currency sign removed.

    Raises:
        IndexError: If an item lacks one of the expected child elements.
    """
    soup = parser(str_url)
    items = soup.select('.item')
    cur_page_data_list = []
    for item in items:
        # Ranking position.
        index = smart_str(item.select('.index')[0].contents[0].string)

        # The first child of .p-name carries both the link and the title.
        p_name = item.select('.p-name')[0].contents[0]
        book_url = smart_str(p_name['href'])
        book_name = smart_str(p_name.text)

        book_info = item.select('.p-info')
        book_auther, book_trans_auther, book_publisher = _split_book_credits(
            book_info[0].select('a'))

        book_img = smart_str(item.select('.p-img.bookimg')[0].img['src'])

        book_prices = book_info[1]
        # <del> holds the list price, the first <span> the shop price.
        del_price = smart_str(book_prices.select('del')[0].text).replace(
            '￥', '')
        jd_price = smart_str(book_prices.select('span')[0].text).replace(
            '￥', '')

        cur_page_data_list.append((
            index,
            book_name,
            book_img,
            book_url,
            book_auther,
            book_trans_auther,
            book_publisher,
            del_price,
            jd_price,
        ))
    return cur_page_data_list