# Maps a supported hierarchy level to the levels whose most recent names,
# joined in this order, form the category path for a node at that level.
_HIERARCHY_LEVEL_PATHS = {
    1: (1,),
    4: (1, 4),
    6: (1, 4, 6),
    7: (1, 4, 7),
    8: (1, 4, 6, 8),
}


def _sanitize_category_name(raw_name):
    """Return ``raw_name`` reduced to letters, ``_`` and the ``|`` separator."""
    cleaned = raw_name.replace('&', 'and').strip()
    # Every character that is not an ASCII letter or '|' becomes '_'.
    cleaned = re.sub(r'[^a-zA-Z|]', '_', cleaned)
    # NOTE(review): doubled underscores are removed outright rather than
    # collapsed to a single '_'. Kept as-is because existing directory names
    # and hierarchy keys were produced this way - confirm before changing.
    return cleaned.replace('__', '').strip()


def form_hierarchy(level):
    """Register the category path for the newest entry at ``level``.

    The path is built from the last name stored in ``level_Dictionary`` for
    each level listed in ``_HIERARCHY_LEVEL_PATHS[level]``, joined with ``|``
    and sanitised.

    * Level 1 (top-level category): its directory is always created and
      nothing is written to ``hierarchy_dict``.
    * Levels 4, 6, 7 and 8: only when the newest name at ``level`` is a key
      of ``url_dict`` is the path stored in ``hierarchy_dict`` (mapped to
      that URL) and the matching nested directory created.
    * Any other level is ignored.

    :param level: depth of the entry that was just appended to
        ``level_Dictionary``.
    :return: None. Results are written to the module-level
        ``hierarchy_dict`` and to disk through ``create_project_dir``.
    :raises KeyError, IndexError: if ``level_Dictionary`` has no entry yet
        for one of the levels the path is built from.
    """
    ancestor_levels = _HIERARCHY_LEVEL_PATHS.get(level)
    if ancestor_levels is None:
        return

    # Most recent name seen at each depth, ordered from the root downwards.
    names = [level_Dictionary[depth][-1] for depth in ancestor_levels]

    if level == 1:
        # Top-level categories have no URL entry; just create the folder.
        category_name = _sanitize_category_name(names[0])
        create_project_dir(project_name + '/' + category_name)
        return

    leaf_name = names[-1]
    if leaf_name not in url_dict:
        return

    category_name = _sanitize_category_name('|'.join(names))
    hierarchy_dict[category_name] = url_dict[leaf_name]
    # '|' separates hierarchy levels in the key; on disk each level is a
    # nested directory.
    create_project_dir(project_name + '/' + category_name.replace('|', '/'))
def create_hirerachy(starting_url):
    """Collect the category hierarchy reachable from ``starting_url`` and
    queue its URLs.

    Creates the project directory, fetches the hierarchy-to-URL mapping from
    ``collect_main_page_urls`` and writes one ``<hierarchy>|<url>`` line per
    entry to the ``queue_links`` file through ``list_to_file``.

    :param starting_url: URL handed to ``collect_main_page_urls``.
    :return: None.
    """
    create_project_dir(PROJECT_NAME)
    category_hierarchy_and_urls = collect_main_page_urls(starting_url)

    # NOTE: per-category directories are not created here. That step is
    # currently disabled, so only the project directory and the queue file
    # are produced by this function.

    # One "hierarchy|url" line per collected category, in mapping order.
    urls_list = [
        '{}|{}'.format(hierarchy, url)
        for hierarchy, url in category_hierarchy_and_urls.items()
    ]

    # Persist the collected lines to the queue file.
    list_to_file('queue_links', urls_list)
def collect_hirerachy_details():
    """Build the category hierarchy from the main page and write it to file.

    Steps, in order:

    1. Create the project directory.
    2. Fetch the page at ``DataCollectors_Configuration.SOUQ_MAIN_URL``.
    3. Cut every ``div.large-4.columns`` element into HTML fragments at each
       ``<h3 class="shop-all-title">`` heading.
    4. Call ``form_url_dict`` on the page.
    5. Parse each fragment and pass it to ``dfs`` (starting at level 0 with
       the shared ``level_Dictionary``).
    6. Hand ``hierarchy_dict`` to ``write_hirerachy_file``.

    :return: None.
    """
    create_project_dir(project_name)

    # The second element of the returned pair is not needed here.
    page_soup, _unused_url = get_page_soup(
        DataCollectors_Configuration.SOUQ_MAIN_URL)

    # Splitting on the heading's opening tag removes it, so a plain <h3> is
    # put back in front of every piece to keep each fragment starting with
    # a heading tag.
    heading_marker = '<h3 class="shop-all-title">'
    category_fragments = [
        '<h3>' + piece
        for column in page_soup.find_all("div", {'class': 'large-4 columns'})
        for piece in str(column).split(heading_marker)
    ]

    # Must run after the fragments are captured and before the traversal.
    form_url_dict(page_soup)

    # Parse every fragment on its own and walk it from level 0.
    for fragment_html in category_fragments:
        dfs(BeautifulSoup(fragment_html, "html.parser"), 0, level_Dictionary)

    # Persist the accumulated hierarchy.
    write_hirerachy_file(hierarchy_dict)