def parse_until_complete(urls, chunksize):
    """
    Parse urls.
    Repeat until all urls are parsed or the attempt limit is reached.
    """
    start = time()
    global error_on_urls
    attempts = 10
    urls = list(filter_urls(set(urls), CSV_FILE_NAME))
    print(f'Read urls from file: {CSV_FILE_NAME}')
    print(f'URLs for parsing: {len(urls)}')
    # Retry while some urls have not been parsed
    for i in range(attempts):
        if i > 0:
            urls = list(error_on_urls)
        error_on_urls = set()  # reset the error set before each pass
        run_iter(urls[:chunksize])
        if not error_on_urls:
            break
    # Save parsed data
    try:
        save_csv(result, CSV_FILE_NAME, update=True)
    except Exception as e:
        print(e)
    finally:
        if error_on_urls:
            write_file('data/error_on_urls.txt', error_on_urls, mode='w')
    print(f'\nTotal time: {time() - start}')
    print(f'Successfully completed: {len(result)} urls')
    print(f'Errors: {len(error_on_urls)}')
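# --- Assumed module-level context (a sketch, not part of the original excerpt) ---
# The functions in this section rely on module-level names that are not shown here.
# The imports below follow directly from the code (time, traceback, os, chain); the
# three containers are an assumption based on how they are used. filter_urls,
# run_iter, save_csv, write_file, get_response, parse_item, get_img and
# CSV_FILE_NAME are assumed to be defined elsewhere in this module or imported
# from parser_lib.
import os
import traceback
from itertools import chain
from time import time

result = []              # parsed item dicts, saved to CSV by parse_until_complete()
completed_urls = set()   # urls parsed successfully during the current run
error_on_urls = set()    # urls that raised an error and are retried on the next pass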
def download():
    """Download skipped images"""
    not_existing_files = all_items_images() - existing_files()
    for url in not_existing_files:
        print(f'Download file: {url}')
        data = get_img(url)
        write_file(url, data, mode='wb')
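# download() depends on all_items_images() and existing_files(), neither of which
# appears in this section. A rough, assumed sketch of the second helper is shown
# below for illustration only: it walks the image directory and returns the paths
# already on disk, so the set difference above yields only the missing files.
# The directory name 'img' is a guess, not the project's actual layout.
def existing_files_sketch(root: str = 'img') -> set:
    """Hypothetical helper: relative paths of image files already downloaded."""
    found = set()
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            found.add(os.path.join(dirpath, name))
    return found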
async def _get_data_from(url):
    """Fetch a url and write the binary response (an image) to disk."""
    try:
        img = await get_response(url)
        if img:
            write_file(url, img, mode='wb')
    except Exception as e:
        print(f'Exception on parse data: {e}. URL: {url}')
        traceback.print_tb(e.__traceback__)
        return
def get_all_menu_links(url: str) -> set:
    """
    Get all urls from the catalog menu (category/subcategory urls)
    """
    from parser_lib import get_url_data, normalize, read_html, write_file
    html = get_url_data(url)
    html = read_html(html)
    page_items = html.select('#nav-catalog a')
    urls = set(normalize(link.attrs.get('href')) for link in page_items)
    write_file('data/categories_urls.txt', urls, mode='w')
    return urls
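# get_all_items_urls() below also calls get_all_menu_links_by_brands(), which is
# not included in this section. It presumably mirrors get_all_menu_links() with a
# different menu selector and output file; the selector and file name below are
# assumptions for illustration only.
def get_all_menu_links_by_brands_sketch(url: str) -> set:
    """Hypothetical variant: collect brand menu urls instead of category urls."""
    from parser_lib import get_url_data, normalize, read_html, write_file
    html = read_html(get_url_data(url))
    page_items = html.select('#nav-brands a')  # assumed selector for the brands menu
    urls = set(normalize(link.attrs.get('href')) for link in page_items)
    write_file('data/brands_urls.txt', urls, mode='w')
    return urls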
async def get_data_from(url):
    """Parse a single item page and download its image."""
    data = await get_response(url)
    try:
        if data:
            parsed_data = parse_item(url, data)
            result.append(parsed_data)
            completed_urls.add(url)
            # Get image: strip the leading '/' so the url doubles as a relative file path
            img_url = (parsed_data.get('img_url') or '')[1:]
            if img_url:
                img = await get_response(img_url)
                if img:
                    write_file(img_url, img, mode='wb')
    except Exception as e:
        print(f'Exception on parse data: {e}. URL: {url}')
        traceback.print_tb(e.__traceback__)
        return
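# run_iter() is called by parse_until_complete() but is not shown in this section.
# A minimal sketch of how it might drive get_data_from() concurrently with asyncio,
# assuming it takes a plain list of urls; the real project may batch, throttle, or
# manage an HTTP session differently, and presumably records failed urls in
# error_on_urls somewhere along this path (likely inside get_response), which the
# sketch does not reproduce.
import asyncio

def run_iter_sketch(urls):
    """Hypothetical driver: run get_data_from() for every url concurrently."""
    async def _runner():
        await asyncio.gather(*(get_data_from(u) for u in urls))
    asyncio.run(_runner())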
def get_all_items_urls() -> set:
    """
    Get the full url list for every item on the site.
    Write results to files by subcategory and return a flat set of urls.
    """
    from parser_lib import write_file, all_subcategory_items, HOST
    chunks = list()
    all_items_file = 'data/all_item_urls_flat.txt'
    # Reuse a previously collected url list if one exists
    if os.path.exists(all_items_file):
        with open(all_items_file, 'r') as f:
            urls = set(u.strip() for u in f)
        if urls:
            return urls
    if not os.path.exists('data'):
        os.mkdir('data')
    # Walk category menu urls, then brand menu urls
    urls = get_all_menu_links(HOST)
    for url in urls:
        print(url)
        chunks.append(all_subcategory_items(url))
    urls_brands = get_all_menu_links_by_brands(HOST)
    for url in urls_brands:
        print(url)
        chunks.append(all_subcategory_items(url, '_brands'))
    # Write item urls to file
    urls = tuple(chain(*chunks))
    print(f'total urls: {len(urls)}')
    urls = set(urls)
    print(f'total unique urls: {len(urls)}')
    write_file(all_items_file, urls)
    return urls
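# A possible end-to-end invocation of this module, assuming it is run as a script.
# The chunk size and entry point are illustrative only, not the project's actual
# command-line interface.
if __name__ == '__main__':
    item_urls = get_all_items_urls()                  # collect (or reload) every item url
    parse_until_complete(item_urls, chunksize=1000)   # parse, save CSV, log errors
    download()                                        # fetch any images that were skipped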