Example #1
def _catch_Index_Url():
    print("开始从TXT文件中获取url")
    All_line = Creep_Tools.Read_Line_by_Line._Read_Line_by_Line("JD_index_url.txt")
    for line in All_line:
        line = line.split("@@@")
        index_name = line[0]
        index_url = line[1]
        # Parse the soup for this index URL
        soup = _Analyze_Soup(index_url)
        # Get the maximum page count under this index URL
        Max_Pag = int(_Page_num(soup))
        print(index_name + " MAX_Pag " + str(Max_Pag))
        if Max_Pag == 0:
            print(Max_Pag)
        elif Max_Pag == 1:
            try:
                print(Max_Pag)
                url = index_url
                soup = _Analyze_Soup(url)
                if soup is not None:
                    # Collect the URLs of all product pages under this URL
                    url_list = parser_for_one_url(soup)
                    for url in url_list:
                        print(index_name + "@@@" + url + "@@@" + str(Max_Pag))
                        commodity_url = index_name + "@@@" + url
                        file.write(commodity_url + "\n")
                else:
                    pass
            except Exception:
                pass
        else:
            try:
                for page in range(1, Max_Pag + 1):  # pages are 1-based, so include the last page
                    print(index_name + str(page))
                    page = "&page=" + str(page)
                    print(index_name + page)
                    try:
                        url = index_url + page
                        soup = Creep_Tools._Analyze_Soup(url)
                        if soup is not None:
                            # Collect the URLs of all product pages under this URL
                            url_list = parser_for_one_url(soup)
                            for url in url_list:
                                print(index_name + "@@@" + url + "@@@" + page)
                                commodity_url = index_name + "@@@" + url + page
                                file.write(commodity_url + "\n")
                        else:
                            pass
                    except Exception:
                        pass
            except Exception:
                pass
    print("")
Example #2
def _Main():
    All_Line_Arr = Creep_Tools.Read_Line_by_Line._Read_Line_by_Line("goods_id.log")
    for All_Line in All_Line_Arr:
        Line_arr = All_Line.split('@@@')
        commodity_index = Line_arr[0]
        commodity_url = Line_arr[1]
        ID_arr = commodity_url.split('.')
        ID = ID_arr[2]
        ID_arr = ID.split('/')
        ID = ID_arr[1]
        # 2. Create the corresponding folders
        code_picture_path = "E:/JD/" + commodity_index + "/" + ID + "/code_picture"
        detail_picture_path = "E:/JD/" + commodity_index + "/" + ID + "/detail_picture"
        Creep_Tools._mkdir(code_picture_path)
        # 3. Fetch the page's HTML
        import commodity_list
        # 4. Parse the page into a soup object

        # Collect the product images
        soup = Creep_Tools._Analyze_Soup(commodity_url)
        ul = soup.find('ul', {'class': "lh"})
        if ul:
            all_li = ul.find_all('li')
            num = 0
            for li in all_li:
                num = num + 1
                img = dict(li.contents[0].attrs)['data-url']
                img_url = 'https://img11.360buyimg.com/popWaterMark/%s' % (img)
                save_path = code_picture_path
                # Build a distinct name per image instead of repeatedly appending to ID
                picture_id = ID + str(num)
                Creep_Tools._Download_Picture(img_url, picture_id, save_path)
        # Get the product details
        product_detail = get_product_detail(soup)
        print(product_detail)

        # Get the product price
        price = get_product_price(soup)
        print(price)
        savePath = get_datail_picture(soup)
        print(savePath)
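Example #2 additionally calls Creep_Tools._mkdir and Creep_Tools._Download_Picture, which are also not shown. A minimal sketch of both follows, assuming os and requests and a .jpg extension for saved images; treat it as an illustration rather than the library's actual API.

import os
import requests

def _mkdir(path):
    # Create the directory (and any missing parents) if it does not already exist
    # (hypothetical sketch of Creep_Tools._mkdir)
    os.makedirs(path, exist_ok=True)

def _Download_Picture(img_url, name, save_path):
    # Download one image and save it as <save_path>/<name>.jpg
    # (hypothetical sketch of Creep_Tools._Download_Picture)
    try:
        resp = requests.get(img_url, timeout=10)
        resp.raise_for_status()
        with open(os.path.join(save_path, name + ".jpg"), "wb") as f:
            f.write(resp.content)
    except requests.RequestException:
        pass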
Example #3
def parser_for_one_url(soup):
    url_list = []
    try:
        lists = soup.find_all("ul", {"class": "gl-warp clearfix"})
        for item in lists:
            hrefs = item.find_all()
            for href in hrefs:
                names = href.find_all("div", {"class": "p-name"})
                for name in names:
                    url = name.a["href"]
                    if url is not None:
                        try:
                            url_list.append(url)
                        except Exception:
                            pass
                    else:
                        print("url is empty")
    except Exception:
        pass
    return url_list


if __name__ == "__main__":
    with open("JD_commodity_urls.txt", mode="w", encoding="utf-8") as file:
        _catch_Index_Url()
    Creep_Tools._Deduplication("JD_commodity_urls.txt")
    # Test URLs
    # soup = _Analyze_Soup("http://list.jd.hk/list.html?cat=1319,1525,7057&go=0&gjz=0")
    # parser_for_one_url(soup)
    print("Finished running")