def get_tree_hierarchy(hierarchy, url):
    """Walk the selected top-level categories linked from the page at ``url``.

    Every ``<a class="nav-a">`` anchor whose formatted name is in
    ``SELECTED_LIST`` has its page requested through ``get_content``; that
    page is then handed to the traversal routine matching its layout
    (left navigation first, carousel otherwise).

    :param hierarchy: last level hierarchy name
    :param url: current page url
    :return: None
    """
    response = get_content(url)
    if not response:
        return

    for anchor_tag in response.findAll('a', {'class': 'nav-a'}):
        category_name = string_format(anchor_tag)
        if category_name not in SELECTED_LIST:
            continue

        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

        # Keep the category page under its own name so the outer page is not
        # overwritten, and skip categories for which get_content returned
        # nothing instead of failing on ``None.find``.
        category_response = get_content(category_url)
        if not category_response:
            continue

        if category_response.find('div', {'class': LEFT_NAV_CLASS}):
            find_nav_hierarchy(hierarchy_name, category_response)
        elif category_response.find('ol', {'class': CAROUSAL_CLASS}):
            find_carousel_hierarchy(hierarchy_name, category_response)
Example #2
def get_tree_hierarchy(hierarchy, url):
    """Visit every small-box-grid category inside the box-grid container of ``url``.

    :param hierarchy: Hierarchy name
    :param url: current page url
    :return: None
    """
    response = get_content(url)
    if response:
        container = response.find('div', {'class': BOX_GRID_CONTAINER_CLASS})
        if container:
            category_containers = container.findAll(
                'div', {'class': SMALL_BOX_GRID_CLASS})
            if len(category_containers) != 0:
                for category in category_containers:
                    # The category name comes from the tile image's alt text.
                    category_name = string_format(category.img['alt'])
                    category_url = url_format(category.find('a')['href'])

                    hierarchy_name = '{}|{}'.format(hierarchy, category_name)

                    category_response = get_content(category_url)
                    # Logic: probe the response for every known layout class,
                    # then branch on whichever layout is present and call the
                    # function that handles it.
                    if category_response:

                        category_response_type_1 = category_response.find(
                            "ul", {'class': INDENT_ONE_CLASS})
                        category_response_type_2 = category_response.findAll(
                            'div', {'class': SMALL_BOX_GRID_CLASS})
                        category_response_type_3 = category_response.find(
                            'div', {'class': LEFT_NAV_CLASS})
                        category_response_type_4 = category_response.find(
                            'ol', {'class': CAROUSAL_CLASS})

                        # NOTE(review): the bodies of the first three branches
                        # below are missing from this copy of the file, and the
                        # last branch only builds ``line`` without using it.
                        # Restore the handler calls before running this code.
                        if category_response_type_1:

                        elif len(category_response_type_2) != 0:

                        elif category_response_type_3:

                        elif category_response_type_4:

                            line = '{}|{}'.format(hierarchy_name, category_url)

def get_tree_hierarchy(hierarchy, url):
    """Pass every category-tile container on the page to ``collect_urls``.

    Containers are processed class by class, in the order
    ``acs-ux-innerc1``, ``acs-ux-innerc2``, ``acs-ux-innerc3``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    """
    response = get_content(url)
    if not response:
        return

    # The trailing space inside each class string is kept exactly as in the
    # original selectors.
    tile_classes = (
        'acs-ux-innerc1 acs-category-tile-links ',
        'acs-ux-innerc2 acs-category-tile-links ',
        'acs-ux-innerc3 acs-category-tile-links ',
    )
    for tile_class in tile_classes:
        for category_container in response.find_all('div', {'class': tile_class}):
            collect_urls(hierarchy, category_container)

    # wait till all the urls complete in queue
    # NOTE(review): no wait/join call is visible after the comment above in
    # this copy of the file -- confirm whether one is missing.
def get_product_urls(hierarchy_url):
    """Collect the product URLs listed on one catalogue page.

    :param hierarchy_url: string in this format 'hierarchy|page_url'
    :return: None
    :working: requests the page, builds one 'hierarchy|product_url' line per
              product tile and hands the lines to ``update_files``
    """
    name_list = hierarchy_url.split('|')
    hierarchy_name = '|'.join(name_list[:-1])
    page_url = name_list[-1]

    hierarchy_path = '/'.join(name_list[2:-1])
    completed_path = '{}{}{}'.format(DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
                                     DataCollectors_Configuration.PATH_STYLE, hierarchy_path)

    response = get_content(page_url)
    if not response:
        return

    product_url_tags = response.find_all('div', {'class': 'catalogue-product row'})
    if not product_url_tags:
        return

    urls_list = []
    for product_url_tag in product_url_tags:
        anchor_tag = product_url_tag.find('a')
        if not anchor_tag:
            # A tile without a link has nothing to record.
            continue
        product_url = '{}{}'.format(MAIN_URL, anchor_tag['href'])
        # Previously each line was built and then dropped, so update_files
        # always received an empty list.
        urls_list.append('{}|{}'.format(hierarchy_name, product_url))

    update_files(completed_path, hierarchy_name, urls_list, page_url, PRODUCTS_INFO_FILE, COMPLETED_PAGE)
Example #5
def get_tree_hierarchy(hierarchy, url):
    """Find the top level of the hierarchy in the left navigation of ``url``.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: builds a 'hierarchy|name|url' line for every anchor inside the
              left-navigation container
    """
    response = get_content(url)
    if not response:
        return

    categories_container = response.find('div', {'class': LEFT_NAV_CLASS})
    if not categories_container:
        return

    for anchor_tag in categories_container.findAll('a'):
        categories_name = string_format(anchor_tag)
        categories_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, categories_name)

        # NOTE(review): the original description says these lines are stored
        # in a queue that feeds a thread pool, but no enqueue call is present
        # in this copy of the file -- confirm and restore it.
        line = '{}|{}'.format(hierarchy_name, categories_url)

Example #6
def form_hierarchy(hierarchy, url):
    """Collect the first hierarchy level from the banner layouts on ``url``.

    Prints a start message, passes each non-empty group of banner-layout
    divs (classes 5, 4, 8, 10, in that order) to ``first_level_hierarchy``
    and finally prints the start time, end time and elapsed seconds.

    :param hierarchy: '|'-separated hierarchy name; its last part is used in
                      the printed messages
    :param url: page to request through ``response_getter.get_content``
    :return: None
    """
    leaf_name = hierarchy.split('|')[-1]
    print('Started {} Hierarchy Collection'.format(leaf_name))
    started = time.time()

    page = response_getter.get_content(url)
    if page:
        layout_classes = ('banner-layout-5', 'banner-layout-4',
                          'banner-layout-8', 'banner-layout-10')
        # Look up every layout first, then process the non-empty ones.
        layout_groups = [page.find_all('div', {'class': layout_class})
                         for layout_class in layout_classes]
        for containers in layout_groups:
            if containers:
                first_level_hierarchy(hierarchy, containers)

    finished = time.time()
    elapsed = finished - started

    print('{} hierarchy collected |Started -> {} secs | Ended -> {} secs| Total -> {} secs '.format(
        leaf_name, started, finished, elapsed))
def find_nav_hierarchy(hierarchy, url):
    """Walk the left-navigation sections of ``url`` and follow their "see more" links.

    :param hierarchy:hierarchy name
    :param url: current url
    :return: None
    """
    response = get_content(url)
    if response:
        nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
        if nav_container:
            # Split the container's HTML on '<h3>' so each piece can be
            # parsed on its own.
            nav_string = str(nav_container).split('<h3>')
            # For every piece, find the category name and its "see more" link.
            for nav in nav_string:
                nav_html = BeautifulSoup(nav, 'lxml')
                if nav_html:
                    category_container = nav_html.find('p')
                    if category_container:
                        main_category_name = string_format(category_container)
                        see_more_tag = nav_html.find("p",
                                                     {"class": SEE_MORE_CLASS})
                        if see_more_tag:
                            category_url = url_format(see_more_tag.a["href"])
                            hierarchy_name = '{}|{}'.format(
                                hierarchy, main_category_name)

                            # for current url find the traverse style as it was
                            # NOTE(review): the call below is cut off in this
                            # copy of the file (its last argument and closing
                            # parenthesis are missing) -- restore before use.
                            find_traverse_type(hierarchy_name, category_url,
def find_traverse_type(hierarchy, url):
    """Request ``url`` and dispatch to the handler matching the page layout.

    Layouts are tried in this priority order: indent-two list, ACS widget
    left navigation, left navigation, carousel. Nothing happens when
    ``get_content`` returns a falsy value or no known layout is found.

    :param hierarchy: hierarchy name passed through to the handler
    :param url: page to request
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    carousel = page.find('ol', {'class': CAROUSAL_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})
    acs_left_nav = page.find('div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})

    if indent_two:
        get_indent_two_hierarchy(hierarchy, page, url)
        return
    if acs_left_nav:
        find_acs_nav_section(hierarchy, page)
        return
    if left_nav:
        find_nav_hierarchy(hierarchy, page)
        return
    if carousel:
        find_carousel_hierarchy(hierarchy, page)
def find_hierarchy(hierarchy, url):
    """Recursively descend the category tree rooted at ``url``.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: recursive function; follows every sub-category link and, once a
              page lists no further sub-categories, stores that page as the
              last level of the hierarchy
    """
    response = get_content(url)
    if not response:
        return

    sub_categories_container = response.find('ul', {'class': INDENT_TWO_CLASS})
    if not sub_categories_container:
        return

    anchor_tags = sub_categories_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})

    if anchor_tags:
        # Anchors present: there are more categories below this one, so
        # recurse into each of them.
        for anchor_tag in anchor_tags:
            category_name = string_format(anchor_tag)
            category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}'.format(hierarchy, category_name)
            find_hierarchy(hierarchy_name, category_url)
    else:
        # No anchors: this is the last level of the hierarchy. The ``else``
        # was missing, which made this run after the recursion instead.
        store_last_level_of_hierarchy(hierarchy, response, url)
Example #10
def get_product_info(hierarchy_url):
    """Request one product page, store its details and record it as completed.

    :param hierarchy_url: string in this format 'hierarchy|page_url'
    :return: None
    :working: requests the page, extracts details with ``get_details``,
              stores them with ``store`` and appends ``hierarchy_url`` to
              the completed-info file
    """
    name_list = hierarchy_url.split('|')
    hierarchy_name = '|'.join(name_list[:-1])
    page_url = name_list[-1]

    hierarchy_path = '/'.join(name_list[2:-1])
    completed_path = '{}{}{}'.format(DataCollectors_Configuration.LINIO_MEX_INFO_ROOT,
                                     DataCollectors_Configuration.PATH_STYLE, hierarchy_path)

    response = get_content(page_url)
    if not response:
        return

    # Named collected_* so the locals do not shadow the common module names
    # ``date`` and ``time``.
    data, collected_date, collected_time = get_details(hierarchy_url, response)
    if not data:
        return

    store(MARKETPLACE, collected_date, hierarchy_name, collected_time, data)
    file_path = '{}/{}'.format(completed_path, COMPLETED_INFO_FILE)
    append_to_file(file_path, hierarchy_url)
def get_box_grid_hierarchy(hierarchy, url):
    """Build hierarchy lines for the small-box-grid categories on ``url``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: none
    :working: find the hierarchy of category which contains small_box_grid as a class name and adds links to the queue
    """

    response = get_content(url)
    if response:
        category_container_1 = response.findAll(
            'div', {'class': SMALL_BOX_GRID_CLASS})
        # category_container_2 = response.findAll('div', {'class': LARGE_BOX_GRID_CLASS})

        if len(category_container_1) != 0:
            for category in category_container_1:
                anchor_tag = category.find('a')
                if anchor_tag:
                    # NOTE(review): the next statement is cut off in this copy
                    # of the file (argument and closing parenthesis missing)
                    # -- restore before use.
                    category_name = string_format(
                    category_url = url_format(anchor_tag['href'])
                    hierarchy_name = '{}|{}'.format(hierarchy, category_name)
                    line = '{}|{}'.format(hierarchy_name, category_url)
                    # add links to the queue
                    # NOTE(review): no enqueue call is visible after this
                    # comment -- confirm whether one is missing.

def get_tree_hierarchy(hierarchy, url):
    """Find the first level of the hierarchy in the indent-one list of ``url``.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: builds a 'hierarchy|name|url' line for every matching anchor in
              the indent-one list
    """
    response = get_content(url)
    if not response:
        return

    sub_categories_container = response.find('ul', {'class': INDENT_ONE_CLASS})
    if not sub_categories_container:
        return

    anchor_tags = sub_categories_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    for anchor_tag in anchor_tags:
        category_name = string_format(anchor_tag)
        category_url = url_format(anchor_tag['href'])
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

        # NOTE(review): the original description says each link is added to a
        # queue, but no enqueue call is present in this copy of the file --
        # confirm and restore it.
        line = '{}|{}'.format(hierarchy_name, category_url)
Example #13
def get_tree_hierarchy(main_category_name, url):
    """Start the category hierarchy collection for one main category.

    Builds a 'main|sub|url' line for every sub-category anchor found in the
    first ``a-list-item`` span of the indent-none list on the page.

    :param main_category_name: Hierarchy name
    :param url: current_page_url
    :return: None
    """
    response = get_content(url)
    if not response:
        return

    category_container = response.find("ul", {'class': INDENT_NONE_CLASS})
    if not category_container:
        return

    sub_category_container = category_container.find("span", {'class': 'a-list-item'})
    if not sub_category_container:
        return

    sub_category_list = sub_category_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
    for sub_category in sub_category_list:
        sub_sub_category_name = string_format(sub_category)
        hierarchy_name = '{}|{}'.format(main_category_name, sub_sub_category_name)
        hierarchy_url = url_format(sub_category['href'])

        # NOTE(review): the original description says the result is stored in
        # a hierarchy directory structure, but nothing consumes ``line`` in
        # this copy of the file -- confirm and restore the missing call.
        line = '{}|{}'.format(hierarchy_name, hierarchy_url)

Example #14
def get_nav_hierarchy(hierarchy, url):
    """Print one 'hierarchy|category|url' line per left-navigation section.

    :param hierarchy:hierarchy name
    :param url: current url
    :return: None
    :working: finds the first level of the hierarchy; for every navigation
              section that has a "see more" link, prints the hierarchy name
              joined with that link's URL
    """
    response_container = get_content(url)
    if not response_container:
        return

    section = response_container.find('div', {'class': 'a-section a-spacing-base'})
    if not section:
        return

    nav_container = section.find('div', {'class': LEFT_NAV_CLASS})
    if not nav_container:
        return

    # Split the container's HTML on '<h3>' so each piece can be parsed on its
    # own and searched for a category name and a "see more" link.
    for nav in str(nav_container).split('<h3>'):
        nav_html = BeautifulSoup(nav, 'lxml')
        if not nav_html:
            continue

        category_container = nav_html.find('p')
        if not category_container:
            continue

        main_category_name = string_format(category_container)
        see_more_tag = nav_html.find("p", {"class": SEE_MORE_CLASS})
        if not see_more_tag:
            continue

        category_url = url_format(see_more_tag.a["href"])
        hierarchy_name = '{}|{}'.format(hierarchy, main_category_name)
        line = '{}|{}'.format(hierarchy_name, category_url)
        # Single-argument call form prints the same text on Python 2 and 3.
        print(line)
def collect_main_page_urls(main_url):
    """Map every 'category|sub_category' name on the home page to its URL.

    ``get_content`` takes a url, parses it and returns the html response.

    :param main_url: Home page url
    :return: dictionary with category_names as key and urls as values; when
             ``get_content`` returns nothing, prints 'got none' and returns
             None
    """
    raw_data = get_content(main_url)
    if not raw_data:
        # The ``else`` that guarded this message was missing, which left the
        # print unreachable and the failure silent.
        print('got none')
        return None

    category_name_and_url = {}  # category names -> urls
    for category in raw_data.findAll('div', {'class': 'popover-grouping'}):
        category_name_tag = category.find(
            'h2', {'class': 'popover-category-name'})
        category_name = string_format(category_name_tag)

        for link_tag in category.findAll('a'):
            sub_category_1 = string_format(link_tag)
            name_key = '{}|{}'.format(category_name, sub_category_1)
            # The href carries no domain name, so prepend it.
            category_name_and_url[name_key] = '{}{}'.format(
                DOMAIN_NAME, link_tag['href'])

    return category_name_and_url
def get_product_urls(hierarchy_url):
    """Collect product URLs from the first results page, then walk the rest.

    :param hierarchy_url: string in this format 'hierarchy|page_url'
    :return: None
    :working: requests the url, records every product link on the page via
              ``update_files``, then finds the last page and calls
              ``traverse_pages``
    """
    name_list = hierarchy_url.split('|')
    hierarchy_name = '|'.join(name_list[:-1])
    page_url = name_list[-1]

    completed_path = '{}{}{}{}{}{}{}'.format(DataCollectors_Configuration.ROOT_FOLDER, DataCollectors_Configuration.PATH_STYLE, DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME,
                                             DataCollectors_Configuration.PATH_STYLE, hierarchy_name.replace('|', DataCollectors_Configuration.PATH_STYLE),
                                             DataCollectors_Configuration.PATH_STYLE, COMPLETED_PAGE)
    completed_set = file_to_set(completed_path)

    # NOTE(review): the page is processed only when in_completed_urls(...) is
    # truthy; the helper's exact meaning is not visible here -- confirm.
    if not in_completed_urls(page_url, completed_set):
        return

    response = get_content(page_url)
    if not response:
        return

    product_url_tags = response.findAll('a', {
        'class': 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'})
    if not product_url_tags:
        return

    # Previously each line was built and then dropped, so update_files always
    # received an empty list.
    urls_list = []
    for product_url_tag in product_url_tags:
        product_url = url_format(product_url_tag['href'])
        urls_list.append('{}|{}'.format(hierarchy_name, product_url))

    update_files(hierarchy_name, urls_list, page_url, PRODUCTS_INFO_FILE, COMPLETED_PAGE)

    last_page = find_last_page(response)
    traverse_pages(hierarchy_name, page_url, last_page, completed_set)
Example #17
def find_acs_nav_section(hierarchy, url):
    """Walk the ACS widget left navigation of ``url`` section by section.

    :param hierarchy: hierarchy name used as the prefix of built names
    :param url: page to request through ``get_content``
    :return: None
    """
    response = get_content(url)
    if response:
        category_container = response.find(
            'div', {'class': ACS_WIDGET_LEFT_NAV_CLASS})
        if category_container:
            category_list = category_container.findAll(
                'div', {'class': ACS_SECTION_CLASS})
            if len(category_list) != 0:
                for category in category_list:
                    # The section header is a button; the trailing space in
                    # the class string is part of the selector as written.
                    category_tag = category.find('button',
                                                 {'class': 'acs-ln-header '})
                    if category_tag:

                        category_name = string_format(category_tag)
                        # NOTE(review): as written, only names that ARE in
                        # IGNORE_LIST are processed -- confirm whether an
                        # else/continue line is missing here.
                        if category_name in IGNORE_LIST:
                            sub_category_links = category.findAll('a')
                            if len(sub_category_links) != 0:
                                for sub_category_link in sub_category_links:
                                    # NOTE(review): every statement from here
                                    # to the end of the function is cut off in
                                    # this copy of the file (arguments and
                                    # closing parentheses are missing) --
                                    # restore before use.
                                    sub_category_name = string_format(
                                    sub_category_url = url_format(

                                    hierarchy_name = '{}|{}|{}'.format(
                                        hierarchy, category_name,
                                    if 'Tout' in hierarchy_name:
                                            hierarchy_name, sub_category_url,
Example #18
def collect_all_data(hierarchy, url, last_page, completed_set):
    """Visit the numbered result pages of ``url`` and record their product links.

    :param hierarchy: hierarchy name
    :param url: current page url
    :param last_page: last page number
    :param completed_set: completed url sets to compare
    :return: None
    :working: collects products url from all pages
    """
    url_list = []
    # NOTE(review): range(2, last_page) stops at last_page - 1 -- confirm
    # whether the last page itself is meant to be visited.
    for pageNo in range(2, last_page):
        current_page = '{}&page={}'.format(url, pageNo)
        if in_completed_urls(current_page, completed_set):

            response = get_content(current_page)
            if response:
                # NOTE(review): the attrs dictionary below is cut off in this
                # copy of the file (its key and the closing '})' are missing)
                # -- restore before use.
                product_url_tags = response.findAll(
                    'a', {
                        'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'
                if len(product_url_tags) != 0:
                    for product_url_tag in product_url_tags:
                        product_url = url_format(product_url_tag['href'])
                        # NOTE(review): ``line`` is built but never added to
                        # url_list in the visible code -- confirm whether an
                        # append is missing.
                        line = '{}|{}'.format(hierarchy, product_url)

                    print('{}|{}'.format(hierarchy, current_page))
                    update_files(hierarchy, url_list, current_page,
                                 PRODUCTS_INFO_FILE, COMPLETED_PAGE)
def find_hierarchy(hierarchy, url):
    """Recursively descend the category tree and record each last level.

    :param hierarchy: category_hierarchy name
    :param url: current_page_url
    :return: None
    :working: recursive function; follows every sub-category link and, when a
              page lists no further sub-categories, prints and stores the
              line 'hierarchy|products_page_url' for that last level
    """
    response = get_content(url)
    if not response:
        return

    sub_categories_container = response.find('ul', {'class': INDENT_TWO_CLASS})
    if not sub_categories_container:
        return

    anchor_tags = sub_categories_container.findAll(
        'a', {'class': NORMAL_ANCHOR_TAG_CLASS})

    if anchor_tags:
        # Anchors present: there are more categories below this one, so
        # recurse into each of them and stop here.
        for anchor_tag in anchor_tags:
            category_name = string_format(anchor_tag)
            category_url = url_format(anchor_tag['href'])
            hierarchy_name = '{}|{}'.format(hierarchy, category_name)
            find_hierarchy(hierarchy_name, category_url)
        return

    # No anchors: this is the last level of the hierarchy. Create the
    # directory and save the hierarchy and url in a file in that directory.
    h4_tag = response.find('h4', {'class': H4_TAG_CLASS})
    if not h4_tag:
        return
    category_name = string_format(h4_tag)

    # Use the tiles-view url when the layout picker link exists, otherwise
    # fall back to the current page url.
    category_url_tag = response.find('a', {'title': LAYOUT_PICKER})
    if category_url_tag:
        category_url = url_format(category_url_tag['href'])
    else:
        category_url = url

    # Append the category name only when it is not already part of the
    # hierarchy string (this is a substring test).
    if category_name in hierarchy:
        hierarchy_name = hierarchy
    else:
        hierarchy_name = '{}|{}'.format(hierarchy, category_name)

    line = '{}|{}'.format(hierarchy_name, category_url)
    # Single-argument call form prints the same text on Python 2 and 3.
    print(line)

    # store the line in a file and create hierarchy directory
    create_directory_and_hierarchy_files(hierarchy_name, line)
def get_nav_hierarchy(hierarchy, url):
    """Build hierarchy lines from the left-navigation sections of ``url``.

    :param hierarchy:hierarchy name
    :param url: current page url
    :return: none
    :working: Finds the 1st level of hierarchy which contains left_nav_class as a traverse style
              and then adds that links to queue
    """
    response = get_content(url)
    if response:
        nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
        if nav_container:
            # Split the container's HTML on '<h3>' so each piece can be
            # parsed on its own.
            nav_string = str(nav_container).split('<h3>')
            for nav in nav_string:
                nav_html = BeautifulSoup(nav, 'lxml')
                if nav_html:

                    category_container = nav_html.find('p')

                    if category_container:
                        main_category_name = string_format(category_container)
                        see_more_tag = nav_html.find("p",
                                                     {"class": SEE_MORE_CLASS})
                        if see_more_tag:
                            # The membership test below is a substring test
                            # against one '|'-joined string, not a list lookup.
                            # NOTE(review): several statements in the rest of
                            # this function are cut off in this copy of the
                            # file (missing arguments / closing parentheses,
                            # and possibly 'else:' lines) -- restore before
                            # use.
                            if main_category_name in 'Angebote_and_Aktionen|Kindle_Fire_and_Echo|Smartphones_and_mehr|Ratgeber_and_Services':
                                category_url = url_format(
                                hierarchy_name = '{}|{}'.format(
                                    hierarchy, main_category_name)

                                # for current url find the traverse style as it was
                                line = '{}|{}'.format(hierarchy_name,
                                # print line
                            anchor_tags = nav_html.findAll('a')
                            if len(anchor_tags) != 0:
                                for anchor_tag in anchor_tags:
                                    sub_category_name = string_format(
                                    if main_category_name in 'Angebote_and_Aktionen|Kindle_Fire_and_Echo|Smartphones_and_mehr|Ratgeber_and_Services':
                                        hierarchy_name = '{}|{}|{}'.format(
                                            hierarchy, main_category_name,
                                        sub_category_url = url_format(

                                        line = '{}|{}'.format(
                                            hierarchy_name, sub_category_url)
def check_and_get_seller_data(raw_data):
    """Return seller information found in ``raw_data``.

    When ``raw_data`` contains a link, the seller page behind it is requested
    and the result of ``get_seller_info`` is returned. Every fallback returns
    a 7-tuple whose first item is the seller name (or 'not_available') and
    whose remaining six items are 'not_available'.

    :param raw_data: parsed element holding the seller name and, optionally,
                     a link to the seller page
    :return: result of ``get_seller_info`` or a 7-tuple as described above
    """
    missing_details = ('not_available',) * 6

    seller_name_tag = raw_data.find('a')
    if seller_name_tag:
        seller_name = text_format(seller_name_tag)
        seller_link = url_format(seller_name_tag['href'])
        seller_raw_data = response_getter.get_content(seller_link)
        if seller_raw_data:
            return get_seller_info(seller_name, seller_raw_data)
        # The seller page could not be loaded: keep the name only.
        return (seller_name,) + missing_details

    # No link at all: the element's own text may still name the seller.
    seller_name = text_format(raw_data)
    if seller_name:
        return (seller_name,) + missing_details
    return ('not_available',) + missing_details
def find_acs_nav_section(hierarchy, url):
    """Build hierarchy lines from the ACS navigation sections of ``url``.

    :param hierarchy: hierarchy name
    :param url: current page url
    :return: None
    :Working: it will find out 1st level of hierarchy after that adds link into queue
    """

    response = get_content(url)
    if response:
        category_container = response.find(
            'div', {'class': 'a-section a-spacing-base'})
        if category_container:
            category_list = category_container.findAll(
                'div', {'class': ACS_SECTION_CLASS})
            if len(category_list) != 0:
                for category in category_list:
                    category_tag = category.find('div',
                                                 {'class': 'acs-ln-links'})
                    if category_tag:
                        category_name = string_format(category_tag)
                        # NOTE(review): as written, only names that ARE in
                        # IGNORE_LIST are processed -- confirm whether an
                        # else/continue line is missing here.
                        if category_name in IGNORE_LIST:
                            sub_category_links = category.findAll('a')
                            if len(sub_category_links) != 0:
                                for sub_category_link in sub_category_links:
                                    # NOTE(review): several statements below
                                    # are cut off in this copy of the file
                                    # (missing arguments / closing
                                    # parentheses), and the final 'if' has no
                                    # body -- restore before use.
                                    sub_category_name = string_format(
                                    sub_category_url = url_format(

                                    hierarchy_name = '{}|{}|{}'.format(
                                        hierarchy, category_name,

                                    line = '{}|{}'.format(
                                        hierarchy_name, sub_category_url)
                                    if 'Tutto' in hierarchy_name:
def start_program():
    """Run ``get_indent_two_hierarchy`` for every 'name|url' entry in ``links``."""
    # NOTE(review): the contents and closing bracket of this list are missing
    # from this copy of the file -- restore before use.
    links = [

    for link in links:
        # Each entry is 'hierarchy|...|url': the last field is the url, the
        # rest is the hierarchy name.
        link_list = link.split('|')
        name = '|'.join(link_list[0:-1])
        url = link_list[-1]
        resposne = get_content(url)
        if resposne:
            get_indent_two_hierarchy(name, resposne, url)
def get_correct_data(hierarchy, url):
    """Request ``url`` and extract product details, retrying on failure.

    :param hierarchy: category hierarchy
    :param url: Current page Url
    :return: valid product details as a tuple; if no attempt out of
             ``CONSTANTS.MAX_RETRIES`` yields data, the last result of
             ``get_data`` (``None`` when it was never called)
    """
    data = None
    for _ in range(CONSTANTS.MAX_RETRIES):
        raw_data = response_getter.get_content(url)
        # raw_data is a BeautifulSoup object; get_data collects the product
        # information from it.
        if raw_data:
            data = get_data(raw_data, hierarchy, url)
            if data:
                # Got usable data: no further retries are needed.
                break
    return data
def find_traverse_type(hierarchy, url, flag):
    """Request ``url`` and dispatch to the handler matching the page layout.

    The indent-two list takes priority over the left navigation.

    :param hierarchy: category hierarchy
    :param url: current page url
    :param flag: True when the function is called for the first time, False
                 on a later call; forwarded to ``get_indent_two_hierarchy``
    :return: None
    """
    response = get_content(url)
    if not response:
        return

    if response.find('ul', {'class': INDENT_TWO_CLASS}):
        get_indent_two_hierarchy(hierarchy, response, url, flag=flag)
    elif response.find('div', {'class': LEFT_NAV_CLASS}):
        find_nav_hierarchy(hierarchy, response)
def get_tree_hierarchy(hierarchy, url):
    """Dispatch each left-navigation section of ``url`` by its category name.

    :param hierarchy: hierarchy name passed through to the level-1 handlers
    :param url: page to request through ``get_content``
    :return: None
    """
    response = get_content(url)
    if response:
        nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
        if nav_container:
            # Split the container's HTML on '<h3>' so each piece can be
            # parsed on its own.
            nav_string = str(nav_container).split('<h3>')
            for nav in nav_string:
                nav_html = BeautifulSoup(nav, 'lxml')
                if nav_html:
                    category_container = nav_html.find('p')
                    if category_container:
                        main_category_name = string_format(category_container)
                        # NOTE(review): the IGNORE_LIST branch below has no
                        # body in this copy of the file, and it is unclear
                        # whether the final get_level_1_hierarchy call
                        # belonged to a missing 'else:' -- restore before use.
                        if main_category_name in IGNORE_LIST:
                        elif main_category_name in SELECTED_LIST:
                            get_level_1_see_more_hierarchy(hierarchy, nav_html)
                            # print(main_category_name)
                            get_level_1_hierarchy(hierarchy, nav_html)

def find_traverse_type(hierarchy, url):
    """Request ``url`` and run the first handler whose layout is on the page.

    Priority order: indent-two list, left navigation, carousel. Nothing
    happens when ``get_content`` returns a falsy value or no known layout is
    found.

    :param hierarchy: hierarchy name passed through to the handler
    :param url: page to request
    :return: None
    """
    page = get_content(url)
    if not page:
        return

    left_nav = page.find('div', {'class': LEFT_NAV_CLASS})
    carousel = page.find('ol', {'class': CAROUSAL_CLASS})
    indent_two = page.find('ul', {'class': INDENT_TWO_CLASS})

    # (layout marker, handler) pairs in priority order.
    handlers = (
        (indent_two, lambda: get_indent_two_hierarchy(hierarchy, page, url)),
        (left_nav, lambda: get_level_1_see_more_hierarchy(hierarchy, page)),
        (carousel, lambda: find_carousel_hierarchy(hierarchy, page)),
    )
    for marker, handler in handlers:
        if marker:
            handler()
            break
Example #28
def find_nav_hierarchy(hierarchy, url):
    # NOTE(review): this excerpt does not parse as shown. The docstring text
    # below has no quote delimiters, the url_format( and .format( calls near
    # the end are cut off mid-expression, and the last line is a dangling
    # argument list whose call is not visible. Restore the missing lines from
    # the upstream file before changing any logic here.

    :param hierarchy:hierarchy name
    :param url: current url
    :return: None
    # Fetch the page; everything below is skipped when the result is falsy.
    response = get_content(url)
    if response:
        # Locate the left-navigation block by its CSS class.
        nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
        if nav_container:
            # Serialise the block and cut it into one fragment per '<h3>'.
            nav_string = str(nav_container).split('<h3>')
            # For every fragment, read the section name and its links.
            for nav in nav_string:
                # Re-parse the fragment so it can be queried on its own.
                nav_html = BeautifulSoup(nav, 'lxml')
                if nav_html:
                    # The first <p> of the fragment supplies the section name.
                    category_container = nav_html.find('p')
                    if category_container:
                        main_category_name = string_format(category_container)

                        # Every anchor in the fragment is a sub-category candidate.
                        anchor_tags = nav_html.findAll('a')
                        if len(anchor_tags) != 0:
                            for anchor_tag in anchor_tags:
                                sub_category_name = string_format(anchor_tag)
                                # The link text with the section name removed is
                                # tested against IGNORE_LIST.
                                # NOTE(review): the statements from here on are
                                # truncated; which branch they belong to cannot
                                # be determined from this excerpt. The print
                                # below is a Python 2 print statement.
                                if sub_category_name.replace(
                                        main_category_name, '') in IGNORE_LIST:
                                    sub_category_url = url_format(

                                    hierarchy_name = '{}|{}|{}'.format(
                                        hierarchy, main_category_name,
                                    if 'Jouets_par_cate_gorie' in hierarchy_name:
                                        print hierarchy_name
                                            hierarchy_name, sub_category_url,
def get_tree_hierarchy(hierarchy, url):
    # NOTE(review): this excerpt does not parse as shown. The docstring text
    # below has no quote delimiters, and the .format( and url_format( calls
    # near the end are cut off mid-expression. The docstring text mentions
    # adding links to a queue, but no such step is visible after `line` is
    # built. Restore the missing lines from the upstream file before
    # changing any logic here.

    :param hierarchy:hierarchy name
    :param url: current page url
    :return: none
    :working: Finds the 1st level of hierarchy which contains left_nav_class as a traverse style
              and then adds that links to queue
    # Fetch the page; everything below is skipped when the result is falsy.
    response = get_content(url)
    if response:
        # Locate the left-navigation block by its CSS class.
        nav_container = response.find('div', {'class': LEFT_NAV_CLASS})
        if nav_container:
            # Serialise the block and cut it into one fragment per '<h3>'.
            nav_string = str(nav_container).split('<h3>')
            for nav in nav_string:
                # Re-parse the fragment so it can be queried on its own.
                nav_html = BeautifulSoup(nav, 'lxml')
                if nav_html:

                    # The first <p> of the fragment supplies the section name.
                    category_container = nav_html.find('p')

                    if category_container:
                        main_category_name = string_format(category_container)
                        # Every anchor in the fragment is a sub-category candidate.
                        anchor_tags = nav_html.findAll('a')
                        if len(anchor_tags) != 0:
                            for anchor_tag in anchor_tags:
                                sub_category_name = string_format(anchor_tag)
                                # NOTE(review): the statements from here on are
                                # truncated; whether they belong to this branch
                                # or to a missing else-branch cannot be
                                # determined from this excerpt.
                                if 'New_Arrivals' in main_category_name:
                                    hierarchy_name = '{}|{}|{}'.format(
                                        hierarchy, main_category_name,
                                    sub_category_url = url_format(

                                    line = '{}|{}'.format(
                                        hierarchy_name, sub_category_url)
Example #30
def find_hierarchy(hierarchy_name, url):
    """Recursively walk category pages and record each last-level category.

    A page whose sub-category container holds an ``<h4>`` with
    ``H4_TAG_CLASS`` is treated as the last level: its link is appended to
    ``hierarchy_name`` and passed to ``create_directory_and_hierarchy_files``.
    Any other page is expected to list further categories; each of those
    links is followed with the hierarchy name extended by the category name.

    :param hierarchy_name: hierarchy names in pipe-delimited format
    :param url: current page url
    :return: None
    """
    response = get_content(url)
    if not response:
        return

    sub_category_container = find_sub_category_container(response)
    if not sub_category_container:
        return

    h4_tag = sub_category_container.find('h4', {'class': H4_TAG_CLASS})

    if h4_tag:
        # An <h4> tag marks the last level of the hierarchy.
        anchor_tag = h4_tag.find('a')
        category_url = url_format(anchor_tag['href'])
        line = '{}|{}'.format(hierarchy_name, category_url)

        # Store the line in a file and create the hierarchy directory.
        create_directory_and_hierarchy_files(hierarchy_name, line)
    else:
        # No <h4>: the page lists more categories, so descend into each one.
        # This branch used to sit inside the `if h4_tag:` block, which made
        # last-level pages recurse and non-last-level pages do nothing.
        anchor_tags = sub_category_container.findAll(
            'a', {'class': NORMAL_ANCHOR_TAG_CLASS})
        for anchor_tag in anchor_tags:
            category_name = string_format(anchor_tag)
            category_url = url_format(anchor_tag['href'])
            hierarchy = '{}|{}'.format(hierarchy_name, category_name)

            # Recursively call this function for the child category.
            find_hierarchy(hierarchy, category_url)
