import json
import os

# These snippets come from a larger Amazon-category scraping module; the helpers
# get_page_source(), the parse_* functions, store_category_list() and the
# module-level `path` directory are defined elsewhere in that module.


def main_second():
    '''
    --> Parse the third-level URLs out of the second-level URLs and save them to second_url.txt
    :return:
    '''
    file_name = 'first_floor.txt'
    file_path = os.path.join(path, file_name)
    file = open(file_path, encoding='utf-8')

    for per_column in file:
        try:
            url_json = per_column.strip()  # each line is one JSON object
            url_item = json.loads(url_json)
            categories_name = url_item['categories_name']
            url = url_item['href']
            # print(url)  # verified during testing
            html = get_page_source(url)
            print(html)
            if categories_name in ["Women", "Men", "Girls", "Boys"]:
                parse_second_page_source_one(html, categories_name)
            elif categories_name in [
                    "Baby", "Novelty & More", "Luggage & Travel Gear"
            ]:
                parse_second_page_source_two(html, categories_name)
            else:
                parse_second_page_source_three(html, categories_name)

        except Exception as e:
            print(e)
            continue

    file.close()
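
# Each line of first_floor.txt is expected to be a JSON object holding a
# category name and its second-level URL. The sample below is an illustrative
# assumption, inferred from the keys read in main_second(), not taken from the
# real file:
#
#   {"categories_name": "Women", "href": "https://www.amazon.com/..."}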
Example #2
def get_clothing_child_url(clothing_name=None, clothing_url=None):
    '''
    Also works for Shoes and Watches.
    get_clothing_child_url(clothing_url=None) --> get the URLs of all subcategories under the Clothing category
    :param clothing_name: e.g. Women:Clothing
    :return: None (each subcategory is either stored via store_category_list or handed on to get_dresses_level_child_url_list)
    '''
    # url = "https://www.amazon.com/s/ref=lp_7147440011_ex_n_2?rh=n%3A7141123011%2Cn%3A7147440011%2Cn%3A1040660&bbn=7147440011&ie=UTF8"
    html = get_page_source(clothing_url)
    # print(html)
    print('-' * 60)
    results = parse_clothing_child_page(html, clothing_name)  # returns a list
    for clothing_child in results:
        clothing_child_dict = {}
        if clothing_child['name'] in [
                "Women:Clothing:Fashion Hoodies & Sweatshirts",
                "Women:Clothing:Jeans", "Women:Clothing:Leggings",
                "Women:Clothing:Jumpsuits, Rompers & Overalls"
        ]:
            clothing_child_dict["category_class"] = 0
            clothing_child_dict["category_levels"] = clothing_child['name']
            clothing_child_dict["category_url"] = clothing_child['href']
            store_category_list(clothing_child_dict)
        else:
            dresses_level_name = clothing_child['name']
            dresses_level_url = clothing_child['href']
            print('fetching fourth-level link:', dresses_level_url)
            print('-' * 60)
            get_dresses_level_child_url_list(dresses_level_url,
                                             dresses_level_name)
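
# Hedged usage sketch: based on the docstring and the URL commented out inside
# the function (the Women > Clothing node), a call would look roughly like:
#
#   get_clothing_child_url(
#       clothing_name='Women:Clothing',
#       clothing_url='https://www.amazon.com/s/ref=lp_7147440011_ex_n_2?rh=n%3A7141123011%2Cn%3A7147440011%2Cn%3A1040660&bbn=7147440011&ie=UTF8')
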
def test_case():
    '''
    --> Used to check that the parsing logic gives the expected result and raises no exceptions
    '''
    test_url = 'https://www.amazon.com/s/ref=lp_7141123011_ex_n_8/143-3620478-1851336?rh=n%3A7141123011%2Cn%3A7586144011&bbn=7141123011&ie=UTF8'
    html = get_page_source(test_url)
    print(html)
    parse_second_page_source_three(html, 'Uniforms, Work & Safety')
def get_dresses_level_child_url_list(dresses_level_url=None, dresses_level_name=None):
    '''
    Get the list of all categories at the same level as Dresses (Dresses and its siblings)
    :param dresses_level_name: e.g. Women:Clothing:Dresses
    '''
    # dresses_url = 'https://www.amazon.com/s/ref=lp_1040660_ex_n_3/143-8365897-7769519?rh=n%3A7141123011%2Cn%3A7147440011%2Cn%3A1040660%2Cn%3A1045024&bbn=1040660&ie=UTF8'
    html = get_page_source(dresses_level_url)
    print('-'*60)
    parse_dresses_level_child_page(html, dresses_level_name)
def main_first():
    '''
    --> Parse the second-level URLs out of the first-level HTML and save them to first_url.txt
    '''

    first_url = "https://www.amazon.com/amazon-fashion/b/ref=topnav_storetab_sl?ie=UTF8&node=7141123011"
    html = get_page_source(first_url)  # fetch the page source
    print(html)
    parse_first_page_source(html)  # parse the first-level html
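
# The get_page_source() helper used throughout is not part of this excerpt.
# The definition below is a minimal sketch only, assuming a plain requests
# fetch with a desktop User-Agent; the author's real helper may instead use
# Selenium, proxies or retry logic.
import requests


def get_page_source(url, timeout=10):
    '''Fetch a URL and return its HTML text, or None if the request fails.'''
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as err:
        print(err)
        return None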
Example #7
def get_clothing_child_url(clothing_url=None):
    # simpler variant from a separate example: the clothing_url parameter is
    # unused and a hard-coded category node URL is fetched instead
    url = "https://www.amazon.com/s/ref=lp_7147440011_ex_n_2?rh=n%3A7141123011%2Cn%3A7147440011%2Cn%3A1040660&bbn=7147440011&ie=UTF8"
    html = get_page_source(url)
    # print(html)
    parse_clothing_child_page(html)