예제 #1
0
파일: Item_list+.py 프로젝트: Bearfu/JD
		# url = "http://list.jd.hk/list.html?cat=1319%2C1525%2C1548&gjz=0&go=0"
		# soup = commodity_list._Analyze_Soup(url)
		# 爬取完整的目录URL信息
		try:
			lists = soup.find_all('ul', {'class': 'menu-drop-list clearfix'})
			for list in lists:
				index = list.find_all('li')
				for li in index:
					name = li.a.string
					url = li.a['href']
					print(name)
					head = "http://list.jd.hk"
					url = name + "@@@" + head + url
					print(url)
					# 写入txt文件
					file.write(url + "\n")
		except:
			pass
	print("运行终了")





# Script entry point: crawl the menu URLs into a text file, then deduplicate it.
if __name__ == '__main__':
	# `file` becomes a module-level global here; _All_Index() (defined above,
	# outside this view) writes its records through that global handle.
	with open('JD_menu+_url.txt', mode='w', encoding="utf-8") as file:
		_All_Index()
	# Deduplicate the output file.
	import Deduplication
	# NOTE(review): these arguments lack the ".txt" suffix the file above was
	# opened with — presumably _Deduplication appends it; verify against the
	# Deduplication module.
	Deduplication._Deduplication("JD_menu+_url", "JD_index_url")
예제 #2
0
파일: item_list.py 프로젝트: Bearfu/JD

# Read a locally saved static page and build a BeautifulSoup object from it.
def _JDURL():
	"""Parse the saved JD page and write one "name@@@url" line per category link.

	Writes through the module-level ``file`` handle opened by the caller
	(the ``__main__`` block below).
	"""
	# NOTE(review): hard-coded absolute path from the author's machine —
	# assumed to exist; consider parameterizing it.
	# `with` fixes the original leak: the handle from open() was never closed.
	with open("/Users/fuzhe/PycharmProjects/Python_other_Tools/Python_other_Tools/JD/jdhk.html", encoding="utf-8") as page:
		# Parser pinned explicitly: the original omitted it, which makes bs4
		# emit a warning and pick whichever parser happens to be installed.
		soup = BeautifulSoup(page, 'html.parser')
	# Category-menu blocks.
	for item in soup.find_all('div', {'class': 'item'}):
		# The original walked every descendant tag and took each tag's first
		# nested <a>, which wrote duplicate records and missed <a> tags that
		# are direct children of the item div; iterating the <a> tags
		# themselves fixes both.
		for link in item.find_all('a'):
			try:
				name = link.string
				url = link['href']
				print(name)
				print(url)
				file.write(name + "@@@" + url + '\n')
			except (KeyError, TypeError):
				# Skip links without an href, or with a non-string label.
				# Narrowed from a bare `except:` that hid real errors.
				pass


# Script entry point: extract menu URLs from the saved page, then deduplicate.
if __name__ == '__main__':
	# `file` becomes a module-level global here; _JDURL() writes its records
	# through that global handle.
	with open('JD_menu_urls.txt', mode='w', encoding="utf-8") as file:
		_JDURL()
	# NOTE(review): `Deduplication` is not imported anywhere in this visible
	# chunk — presumably imported at the top of the original file; verify.
	Deduplication._Deduplication("JD_menu_urls.txt", "JD_menu_url.txt")
	print("运行终了")