def _load_urls(file_paths):
    """Return the union of URLs read (via ``file_to_set``) from *file_paths*.

    When ``DataCollectors_Configuration.PRODUCT_INFO_FLAG`` equals
    ``CONSTANTS.PRODUCT_FlAG`` only the first
    ``DataCollectors_Configuration.NO_OF_PRODUCT_INFO_TO_COLLECT`` URLs
    yielded by each file are kept; otherwise every URL is kept.
    """
    collected = set()
    for file_path in file_paths:
        urls = file_to_set(file_path)
        if DataCollectors_Configuration.PRODUCT_INFO_FLAG != CONSTANTS.PRODUCT_FlAG:
            collected.update(urls)
            continue
        # Sample mode: load only the number of URLs configured in
        # DataCollectors_Configuration. Which URLs are picked depends on the
        # iteration order of whatever file_to_set returns.
        limit = DataCollectors_Configuration.NO_OF_PRODUCT_INFO_TO_COLLECT
        for count, url in enumerate(urls, start=1):
            if count > limit:
                break
            collected.add(url)
    return collected


def get_updated_links(queue_files, completed_files):
    """Return the queued URLs that have not been completed yet.

    :param queue_files: iterable of file paths holding queued URLs
    :param completed_files: iterable of file paths holding completed URLs
    :return: set of URLs found in *queue_files* but not in *completed_files*

    In sample mode (see ``_load_urls``) the per-file cap is applied to both
    the queue files and the completed files.
    """
    queue_set = _load_urls(queue_files)
    # NOTE(review): the sample cap also truncates the completed URLs, so a
    # completed URL beyond the cap is not subtracted here -- confirm that
    # this is intended.
    completed_set = _load_urls(completed_files)
    return queue_set - completed_set
def get_product_urls(hierarchy_url):
    """
    Collect the product links of one listing page and continue with the
    remaining pages.

    :param hierarchy_url: string in the format 'hierarchy|page_url'; everything
        before the last '|' is the hierarchy name, the rest is the page URL
    :return: None
    :working: skips the page when ``in_completed_urls`` reports it as
        completed; otherwise requests the URL, records the product links found
        on it through ``update_files``, then finds the last page and calls
        ``traverse_pages``
    """
    # Split on the LAST '|' only: the hierarchy name may itself contain '|'.
    hierarchy_name, _, page_url = hierarchy_url.rpartition('|')

    sep = DataCollectors_Configuration.PATH_STYLE
    completed_path = '{}{}{}{}{}{}{}'.format(
        DataCollectors_Configuration.ROOT_FOLDER, sep,
        DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME, sep,
        hierarchy_name.replace('|', sep), sep,
        COMPLETED_PAGE)
    completed_set = file_to_set(completed_path)

    if in_completed_urls(page_url, completed_set):
        return

    response = get_content(page_url)
    if not response:
        return

    product_link_class = 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'
    product_url_tags = response.findAll('a', {'class': product_link_class})
    if product_url_tags:
        # One 'hierarchy|product_url' line per product link on the page.
        urls_list = [
            '{}|{}'.format(hierarchy_name, url_format(tag['href']))
            for tag in product_url_tags
        ]
        update_files(hierarchy_name, urls_list, page_url, PRODUCTS_INFO_FILE, COMPLETED_PAGE)

    # NOTE(review): nesting reconstructed -- traversal is assumed to run for
    # every successful response, even when no product links were found on
    # this page. Confirm against the intended behavior.
    last_page = find_last_page(response)
    traverse_pages(hierarchy_name, page_url, last_page, completed_set)
def start_url_collection(folder_name):
    """
    Queue every URL found in the files of a category folder.

    :param folder_name: category folder name, resolved under
        ROOT_FOLDER / AMAZON_CANADA_PROJECT_NAME from DataCollectors_Configuration
    :return: None
    :working: looks up the files matching ``PATTERN_1`` with ``get_all_files``,
        puts each URL of every existing file on ``urls_queue`` and waits on
        ``urls_queue.join()``
    """
    sep = DataCollectors_Configuration.PATH_STYLE
    path = '{}{}{}{}{}'.format(
        DataCollectors_Configuration.ROOT_FOLDER, sep,
        DataCollectors_Configuration.AMAZON_CANADA_PROJECT_NAME, sep,
        folder_name)

    for file_path in get_all_files(path, DataCollectors_Configuration.PATTERN_1):
        if not os.path.exists(file_path):
            continue
        for url in file_to_set(file_path):
            urls_queue.put(url)
        # NOTE(review): join() is assumed to run once per file, so one file's
        # URLs are finished before the next file is read -- confirm; it may
        # have been intended to run once after all files were queued.
        urls_queue.join()