Example #1
def extract_htmlpage_products(url, coding):
	# Writes the extracted (category, URL) pairs to a file literally named "url".
	url_file = open("url", "w")
	base_url1 = "http://www.360buy.com/"
	base_url2 = "http://www.360buy.com"
	htmlpage_soup = Utils.__htmlpage_soup(url, coding)
	if htmlpage_soup == -1:  # the helper returns -1 on fetch/parse failure (see Example #2)
		return (None, None)
	blocks = htmlpage_soup.findAll("div", attrs={"class":"mt"})
	if not blocks:
		return (None, None)
	blocks_sub = htmlpage_soup.findAll("em")
	products_top_catalog_href = [] # top-level categories
	products_sub_catalog_href = [] # second-level categories
	for block in blocks:
		catalog = Utils.extract_text_from_htmlline(str(block)).strip()
		href = Utils.extract_href_from_htmlline(str(block)).strip()
		products_top_catalog_href.append((catalog, href))
	for block_sub in blocks_sub:
		parts = Utils.split_htmlline2parts(str(block_sub).strip(), "</em>")
		for part in parts:
			catalog_sub = Utils.extract_text_from_htmlline(str(part)).strip()
			if not catalog_sub:
				continue
			href_sub = Utils.extract_href_from_htmlline(str(part)).strip()
			# Normalize relative links against the site root.
			if not href_sub.startswith("http://"):
				if href_sub.startswith("/"):
					href_sub = base_url2 + href_sub
				else:
					href_sub = base_url1 + href_sub
			products_sub_catalog_href.append((catalog_sub, href_sub))
	for catalog, href in products_sub_catalog_href:
		print >> url_file, catalog.encode("utf8"), " ", href.encode("utf8")
	url_file.close()
	return products_top_catalog_href, products_sub_catalog_href
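
A minimal driver for the function above, assuming the snippet's Utils helpers are importable; the start URL and the "gbk" coding are hypothetical, not part of the original:

# Hypothetical call; the URL and coding below are assumptions.
top, sub = extract_htmlpage_products("http://www.360buy.com/allSort.aspx", "gbk")
if top is not None:
	for catalog, href in sub:
		print catalog.encode("utf8"), href.encode("utf8")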
Example #2
def extract_products_pagenum(url, coding):
	# Returns (max page number, pagination href base); (-1, -1) on failure.
	htmlpage_soup = Utils.__htmlpage_soup(url, coding)
	if htmlpage_soup == -1:
		return -1, -1
	block = htmlpage_soup.find("div", attrs={"class":"pagin fr"})
	if not block:
		return -1, -1
	maxnum = Utils.extract_maxnum_from_htmlline(str(block))
	href_base = Utils.extract_mutil_href_from_htmlline(str(block))
	return maxnum, href_base
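
A sketch of how the return values might drive a page loop; treating href_base as a URL prefix that the page number is appended to is an assumption about Utils.extract_mutil_href_from_htmlline, and list_url is hypothetical:

# Hypothetical loop; href_base's exact format is an assumption.
list_url = "http://www.360buy.com/products/670-671-672.html"  # hypothetical
maxnum, href_base = extract_products_pagenum(list_url, "gbk")
if maxnum != -1:
	page_urls = [href_base + str(n) for n in range(1, maxnum + 1)]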
Example #3
def products_id_maker(url, coding):
	if not url:
		return -1
	soup = Utils.__htmlpage_soup(url, coding)
	if soup == -1:
		return -1
	# Collect the SKU identifier from every <li sku="..."> element on the page.
	return [str(li['sku']) for li in soup.findAll('li', {'sku': True})]
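
Called on a product-list page this yields one SKU string per listed item; the example URL below is hypothetical:

# Hypothetical call; the URL and coding are assumptions.
skus = products_id_maker("http://www.360buy.com/products/670-671-672.html", "gbk")
if skus != -1:
	for sku in skus:
		print sku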
Example #4
def get_reviews_page_num(url, coding):
	# Assumes BeautifulSoup is imported (the findAll calls suggest the BeautifulSoup 3-era API).
	pagination_soup = Utils.__htmlpage_soup(url, coding)
	if pagination_soup == -1:
		return -1
	pagination = pagination_soup.findAll("div", attrs={"class":"Pagination"})
	# Re-parse the stringified result list to isolate the pager's <a> links.
	soup2 = BeautifulSoup(str(pagination))
	at = soup2.findAll('a')
	# The largest numeric link text is the last review page.
	max_num = 0
	for i in at:
		try:
			m = Utils.str2int(i.text)
		except Exception:  # skip non-numeric link texts such as "next"
			continue
		if m > max_num:
			max_num = m
	return max_num
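
A sketch of a crawl loop driven by the page count; review_url and the page-query format below are assumptions, not part of the original:

# Hypothetical loop; how the page number is encoded in the URL is an assumption.
review_url = "http://club.360buy.com/review/1234-1-1.html"  # hypothetical
num_pages = get_reviews_page_num(review_url, "gbk")
for page in range(1, num_pages + 1):
	page_url = "%s&page=%d" % (review_url, page)  # assumed query format
	# ...fetch and parse each review page here...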