def get_author_list(str_url, site_category):
    """Build one record per ``<dd>`` found under the element with id "experts".

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).
        site_category: Accepted but not used by this function.

    Returns:
        A list of tuples ``(site, url, user_name, rss, timestamp)`` where
        ``site`` is always ``'blog.csdn.net'``, ``url`` is the stripped
        ``href`` of the first link in the ``<dd>``, ``user_name`` is the part
        of ``url`` after the first ``'net/'``, ``rss`` is ``url + '/rss/list'``
        and ``timestamp`` comes from ``datehelper.now_datetime()``.

    Raises:
        IndexError: If no "experts" element is found, a ``<dd>`` has no link,
            or the link does not contain ``'net/'``.
    """
    soup = parser(str_url)
    experts = soup.find_all(SoupStrainer(id="experts"))
    site = 'blog.csdn.net'

    records = []
    for entry in experts[0].select("dd"):
        first_link = entry.select("a")[0]
        url = smart_str(first_link['href'].strip())
        # The account name is whatever follows the host part of the URL.
        user_name = url.split('net/')[1]
        rss = url + '/rss/list'
        records.append(
            (site, url, user_name, rss, datehelper.now_datetime())
        )
    return records
def get_book_list(str_url):
    """Extract one record per ``.item`` element of a book ranking page.

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).

    Returns:
        A list of tuples
        ``(index, name, image_src, url, author, translator, publisher,
        list_price, jd_price)``.  Author, translator and publisher default to
        ``''`` when the first ``.p-info`` element does not carry them; both
        prices have the ``'¥'`` sign removed.

    Raises:
        IndexError: If an item lacks one of the expected sub-elements
            (``.index``, ``.p-name``, two ``.p-info`` blocks,
            ``.p-img.bookimg``, ``del`` or ``span``).
    """
    # NOTE(review): a function with this same name is defined again further
    # down in this file; at import time the later definition takes effect.
    soup = parser(str_url)

    records = []
    for item in soup.select('.item'):
        index = smart_str(item.select('.index')[0].contents[0].string)  # ranking position
        title_link = item.select('.p-name')[0].contents[0]
        book_url = smart_str(title_link['href'])  # link to the book page
        book_name = smart_str(title_link.text)  # book title

        book_info = item.select('.p-info')
        credit_links = book_info[0].select('a')
        link_count = len(credit_links)

        book_author = ''
        book_translator = ''
        book_publisher = ''
        if link_count == 1:
            # A single link is the publisher.
            book_publisher = smart_str(credit_links[0].text)
        elif link_count == 2:
            # Author followed by publisher.  The previous code read index 1
            # for both fields, so the author duplicated the publisher.
            book_author = smart_str(credit_links[0].text)
            book_publisher = smart_str(credit_links[1].text)
        elif link_count == 3:
            # Author, translator, publisher.
            book_author = smart_str(credit_links[0].text)
            book_translator = smart_str(credit_links[1].text)
            book_publisher = smart_str(credit_links[2].text)

        book_img = smart_str(item.select('.p-img.bookimg')[0].img['src'])

        book_prices = book_info[1]
        # List price (struck through) and JD selling price, currency sign removed.
        del_price = smart_str(book_prices.select('del')[0].text).replace('¥', '')
        jd_price = smart_str(book_prices.select('span')[0].text).replace('¥', '')

        records.append((
            index,
            book_name,
            book_img,
            book_url,
            book_author,
            book_translator,
            book_publisher,
            del_price,
            jd_price,
        ))
    return records
def countSale(str_url):
    """Add up price times quantity over every ``.item`` element of a page.

    For each item the price is read from the first ``<strong>`` and the
    quantity from the first ``<em>``; an item without any ``<em>`` counts
    with quantity 0.  Each product is rounded to two decimals before it is
    added to the running total.

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).

    Returns:
        The accumulated total as a float (``0.0`` when there are no items).

    Raises:
        IndexError: If an item has no ``<strong>`` element.
        ValueError: If the price text holds no parsable number or the
            quantity text is not an integer.
    """
    # NOTE(review): a function with this same name is defined again further
    # down in this file; at import time the later definition takes effect.
    soup = parser(str_url)
    total = 0.0  # total sales amount
    for item in soup.select('.item'):
        price_node = item.select('strong')[0]
        quantity_nodes = item.select('em')
        # Units sold for this single item.
        quantity = int(quantity_nodes[0].contents[0]) if quantity_nodes else 0
        price_text = smart_str(price_node.contents[0])
        # Keep only digits and the decimal point (drops currency signs etc.).
        numeric_text = ''.join(ch for ch in price_text if ch in '0123456789.')
        total += round(float(numeric_text) * quantity, 2)
    return total
def countSale(str_url):
    """Add up price times quantity over every ``.item`` element of a page.

    For each item the price is read from the first ``<strong>`` and the
    quantity from the first ``<em>``; an item without any ``<em>`` counts
    with quantity 0.  Each product is rounded to two decimals before it is
    added to the running total.

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).

    Returns:
        The accumulated total as a float (``0.0`` when there are no items).

    Raises:
        IndexError: If an item has no ``<strong>`` element.
        ValueError: If the price text holds no parsable number or the
            quantity text is not an integer.
    """
    # NOTE(review): a function with this same name is also defined earlier in
    # this file; this later definition replaces it at import time.
    soup = parser(str_url)
    total = 0.0  # total sales amount
    for item in soup.select('.item'):
        price_node = item.select('strong')[0]
        quantity_nodes = item.select('em')
        # Units sold for this single item.
        quantity = int(quantity_nodes[0].contents[0]) if quantity_nodes else 0
        price_text = smart_str(price_node.contents[0])
        # Keep only digits and the decimal point (drops currency signs etc.).
        numeric_text = ''.join(ch for ch in price_text if ch in '0123456789.')
        total += round(float(numeric_text) * quantity, 2)
    return total
def get_ssq_list(str_url):
    """Extract lottery draw rows from the first ``<table>`` of a page.

    The first two ``<tr>`` rows are skipped (presumably header rows --
    confirm against the page), as is any row with five or fewer ``<td>``
    cells.

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).

    Returns:
        A list of 13-tuples
        ``(draw_date, issue_no, red1, red2, red3, red4, red5, red6, blue1,
        sales, first_prize_count, second_prize_count, distribution)``.

    Raises:
        IndexError: If the page has no table, or a kept row has fewer than
            seven ``<em>`` elements in its third cell.
    """
    soup = parser(str_url)
    tables = soup.find_all(SoupStrainer("table"))
    rows = tables[0].find_all('tr')

    results = []
    for row in rows[2:]:
        cells = row.find_all('td')
        if len(cells) <= 5:
            continue

        draw_date = cells[0].text  # draw date
        issue_no = cells[1].text  # issue number
        balls = cells[2].select('em')
        # Six red numbers followed by the blue one; explicit indexing so a
        # short row still raises IndexError.
        numbers = tuple(balls[pos].text for pos in range(7))
        sales = cells[3].contents[0].string  # sales amount
        first_count = cells[4].contents[0].string  # number of first prizes
        second_count = cells[5].contents[0].string  # number of second prizes
        # Where the first prizes were won, with the wrapping "(", ")" and
        # ".." removed.
        raw_distribution = smart_str(cells[4].contents[1].string)
        distribution = (
            raw_distribution.replace("(", "")
            .replace("..", "")
            .replace(")", "")
            .strip()
        )

        results.append(
            (draw_date, issue_no)
            + numbers
            + (sales, first_count, second_count, distribution)
        )
    return results
def get_goods_data_list(str_url):
    """Look up several fields of a goods detail page and print two of them.

    The function returns nothing; it prints the ``.tb-key.tb-key-sku``
    matches and the ``.tb-evaluate.tb-clearfix`` matches found inside the
    first ``.tb-property`` element.  The title, price and sold-out lookups
    are evaluated but their results are not used.

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).

    Raises:
        IndexError: If ``.tb-detail-hd``, ``.tb-property``,
            ``.tb-detail-price`` or its ``<strong>`` is missing.
    """
    soup = parser(str_url)

    title_block = soup.select('.tb-detail-hd')[0]
    goods_name = smart_str(title_block.h3.contents[0].string)  # currently unused

    props = soup.select('.tb-property')
    main_props = props[0]

    price_blocks = main_props.select('.tb-detail-price')
    j_str_price = price_blocks[0].select('strong')[0].text  # currently unused

    sold_out = main_props.select('.tb-sold-out.tb-clearfix')  # currently unused
    evaluate = main_props.select('.tb-evaluate.tb-clearfix')

    print(props[0].select('.tb-key.tb-key-sku'))
    print(evaluate)
def getTotalPage(str_url):
    """Read the total page count from the first ``.page-info`` element.

    The element's first child is converted with ``str`` and split on ``'/'``;
    the second piece is returned as an integer (so text shaped like
    ``"<current>/<total>"`` is presumably expected -- confirm against the page).

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).

    Returns:
        The integer found after the first ``'/'``.

    Raises:
        IndexError: If there is no ``.page-info`` element or no ``'/'`` in
            its first child.
        ValueError: If the piece after the ``'/'`` is not an integer.
    """
    soup = parser(str_url)
    info_node = soup.select('.page-info')[0]
    pieces = str(info_node.contents[0]).split('/')
    return int(pieces[1])
def get_book_list(str_url):
    """Extract one record per ``.item`` element of a book ranking page.

    Args:
        str_url: Value handed to ``parser`` to obtain the parsed page
            (presumably a page URL -- confirm against ``parser``).

    Returns:
        A list of tuples
        ``(index, name, image_src, url, author, translator, publisher,
        list_price, jd_price)``.  Author, translator and publisher default to
        ``''`` when the first ``.p-info`` element does not carry them; both
        prices have the ``'¥'`` sign removed.

    Raises:
        IndexError: If an item lacks one of the expected sub-elements
            (``.index``, ``.p-name``, two ``.p-info`` blocks,
            ``.p-img.bookimg``, ``del`` or ``span``).
    """
    # NOTE(review): a function with this same name is also defined earlier in
    # this file; this later definition replaces it at import time.
    soup = parser(str_url)

    records = []
    for item in soup.select('.item'):
        index = smart_str(item.select('.index')[0].contents[0].string)  # ranking position
        title_link = item.select('.p-name')[0].contents[0]
        book_url = smart_str(title_link['href'])  # link to the book page
        book_name = smart_str(title_link.text)  # book title

        book_info = item.select('.p-info')
        credit_links = book_info[0].select('a')
        link_count = len(credit_links)

        book_author = ''
        book_translator = ''
        book_publisher = ''
        if link_count == 1:
            # A single link is the publisher.
            book_publisher = smart_str(credit_links[0].text)
        elif link_count == 2:
            # Author followed by publisher.  The previous code read index 1
            # for both fields, so the author duplicated the publisher.
            book_author = smart_str(credit_links[0].text)
            book_publisher = smart_str(credit_links[1].text)
        elif link_count == 3:
            # Author, translator, publisher.
            book_author = smart_str(credit_links[0].text)
            book_translator = smart_str(credit_links[1].text)
            book_publisher = smart_str(credit_links[2].text)

        book_img = smart_str(item.select('.p-img.bookimg')[0].img['src'])

        book_prices = book_info[1]
        # List price (struck through) and JD selling price, currency sign removed.
        del_price = smart_str(book_prices.select('del')[0].text).replace('¥', '')
        jd_price = smart_str(book_prices.select('span')[0].text).replace('¥', '')

        records.append((
            index,
            book_name,
            book_img,
            book_url,
            book_author,
            book_translator,
            book_publisher,
            del_price,
            jd_price,
        ))
    return records