def collect_all_data(hierarchy, url, last_page, completed_set):
    """Collect product urls from the listing pages that follow the first one.

    For every page number in ``range(2, last_page)`` the page url is built as
    ``<url>&page=<n>``. Pages already reported as completed by
    ``in_completed_urls`` are skipped. For each remaining page that returns
    content and contains product links, one ``hierarchy|product_url`` line is
    produced per link and handed to ``update_files`` together with the page
    url.

    :param hierarchy: hierarchy name, used as the prefix of every line
    :param url: base listing url; ``&page=<n>`` is appended for each page
    :param last_page: exclusive upper bound of the page numbers visited
    :param completed_set: completed urls, checked through in_completed_urls
    :return: None
    """
    # Runtime selector string; must stay exactly as the site emits it.
    link_class = ('a-link-normal s-access-detail-page '
                  's-color-twister-title-link a-text-normal')

    # NOTE(review): range() stops before ``last_page``. Confirm the caller
    # passes a bound one past the final page, otherwise the last page is
    # never visited.
    for page_no in range(2, last_page):
        current_page = '{}&page={}'.format(url, page_no)
        if in_completed_urls(current_page, completed_set):
            continue

        response = get_content(current_page)
        if not response:
            continue

        # presumably ``response`` is a BeautifulSoup document - TODO confirm
        product_url_tags = response.findAll('a', {'class': link_class})
        if not product_url_tags:
            continue

        # Build a fresh list per page: a single list shared across iterations
        # would hand the urls of every earlier page to update_files again on
        # each later page.
        page_lines = [
            '{}|{}'.format(hierarchy, url_format(tag['href']))
            for tag in product_url_tags
        ]
        print('{}|{}'.format(hierarchy, current_page))
        update_files(hierarchy, page_lines, current_page,
                     PRODUCTS_INFO_FILE, COMPLETED_PAGE)
def get_product_urls(hierarchy_url):
    """Collect product urls for one hierarchy, starting at its first page.

    The argument is split at its last ``|``: everything before it is the
    hierarchy name (which may itself contain ``|``), everything after it is
    the page url. If the page is not yet completed it is requested, its
    product links are recorded through ``update_files``, the last page number
    is looked up with ``find_last_page`` and the remaining pages are handed to
    ``traverse_pages``.

    :param hierarchy_url: string in the format 'hierarchy|page_url'
    :return: None
    """
    # rpartition splits on the LAST '|'; without any '|' the whole string is
    # treated as the page url and the hierarchy name is empty.
    hierarchy_name, _, page_url = hierarchy_url.rpartition('|')

    separator = DataCollectors_Configuration.PATH_STYLE
    # <root><sep><project><sep><hierarchy as nested folders><sep><file>
    completed_path = separator.join(str(part) for part in (
        DataCollectors_Configuration.ROOT_FOLDER,
        DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME,
        hierarchy_name.replace('|', separator),
        COMPLETED_PAGE,
    ))
    completed_set = file_to_set(completed_path)

    # NOTE(review): when the first page is already completed nothing else
    # runs, so later pages of this hierarchy are not traversed on a re-run.
    # Confirm this is the intended resume behaviour.
    if in_completed_urls(page_url, completed_set):
        return

    response = get_content(page_url)
    if not response:
        return

    # Runtime selector string; must stay exactly as the site emits it.
    link_class = ('a-link-normal s-access-detail-page '
                  's-color-twister-title-link a-text-normal')
    # presumably ``response`` is a BeautifulSoup document - TODO confirm
    product_url_tags = response.findAll('a', {'class': link_class})
    if product_url_tags:
        urls_list = [
            '{}|{}'.format(hierarchy_name, url_format(tag['href']))
            for tag in product_url_tags
        ]
        update_files(hierarchy_name, urls_list, page_url,
                     PRODUCTS_INFO_FILE, COMPLETED_PAGE)

    # NOTE(review): the nesting here was reconstructed from a source whose
    # indentation was lost; the follow-up pages are traversed whenever the
    # first page returned content, even if it held no product links.
    last_page = find_last_page(response)
    traverse_pages(hierarchy_name, page_url, last_page, completed_set)