Example #1
def parse_until_complete(urls, chunksize):
    """
        Parse urls.
        Retry failed urls until all are parsed or the attempt limit is reached.
    """
    start = time()
    global error_on_urls
    attempts = 10
    urls = list(filter_urls(set(urls), CSV_FILE_NAME))
    print(f'Read urls from file: {CSV_FILE_NAME}')
    print(f'URLs for parsing: {len(urls)}')

    # Retry while some urls have not been parsed yet
    for i in range(attempts):
        if i > 0:
            urls = list(error_on_urls)
            error_on_urls = set()  # reset set
        run_iter(urls[:chunksize])

        if not error_on_urls:
            break

    # Save parsed data
    try:
        save_csv(result, CSV_FILE_NAME, update=True)
    except Exception as e:
        print(e)
    finally:
        if error_on_urls:
            write_file('data/error_on_urls.txt', error_on_urls, mode='w')

    print(f'\nTotal time: {time() - start}')
    print(f'Successfully completed: {len(result)} urls')
    print(f'Errors: {len(error_on_urls)}')
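parse_until_complete() mutates module-level state that is shared with the coroutine in Example #5. The declarations themselves are not part of these excerpts; a minimal sketch of what they presumably look like, based on how the examples use them:

# Module-level state assumed by the examples below (not shown in the excerpts)
result = []              # parsed item dicts, written out by save_csv()
completed_urls = set()   # urls parsed without errors
error_on_urls = set()    # urls to retry on the next attempt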
Example #2
def download():
    """Download skipped images"""
    not_existing_files = all_items_images() - existing_files()

    for url in not_existing_files:
        print(f'Download file: {url}')
        data = get_img(url)
        if data:  # skip urls whose download failed
            write_file(url, data, mode='wb')
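download() depends on two helpers that are not shown: all_items_images(), which presumably returns the relative image paths for every catalog item, and existing_files(), which returns the paths already saved to disk, so the set difference is exactly the skipped images. A hypothetical sketch of existing_files(), assuming images are written to paths that mirror their relative urls:

from pathlib import Path

def existing_files():
    # Hypothetical helper: relative paths of images already on disk.
    # The real implementation and storage layout are not shown in these examples.
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    return {str(p) for p in Path('.').rglob('*')
            if p.is_file() and p.suffix in image_suffixes}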
Example #3
async def _get_data_from(url):
    try:
        img = await get_response(url)
        if img:
            write_file(url, img, mode='wb')

    except Exception as e:
        print(f'Except on parse data: {e}. URL: {url}')
        traceback.print_tb(e.__traceback__)
        return
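_get_data_from() handles a single image url; the scheduler that fans it out is not included in these examples. A minimal sketch of driving a batch of urls concurrently with the standard library event loop (the function and variable names here are illustrative only):

import asyncio

async def _download_images(urls):
    # Fetch and write every image concurrently; errors are already
    # caught and logged inside _get_data_from() itself.
    await asyncio.gather(*(_get_data_from(u) for u in urls))

# asyncio.run(_download_images(image_urls))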
Example #4
def get_all_menu_links(url: str) -> set:
    """
        Get all urls from the catalog menu
        (category and subcategory urls)
    """
    from parser_lib import get_url_data, normalize, read_html, write_file
    html = get_url_data(url)
    html = read_html(html)
    page_items = html.select('#nav-catalog a')

    urls = {normalize(link.attrs.get('href')) for link in page_items}

    write_file('data/categories_urls.txt', urls, mode='w')

    return urls
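normalize() is imported from parser_lib and is not shown here; judging by its use, it turns the relative hrefs pulled out of the menu into absolute urls. A hypothetical sketch using HOST (the site constant imported in Example #6):

from urllib.parse import urljoin

def normalize(href, host=HOST):
    # Hypothetical: resolve a possibly relative href against the site host.
    # The real normalize() in parser_lib may do additional cleanup.
    return urljoin(host, href)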
Example #5
async def get_data_from(url):
    data = await get_response(url)
    try:
        if data:
            parsed_data = parse_item(url, data)
            result.append(parsed_data)
            completed_urls.add(url)

            # Get image (img_url appears to be stored with a leading slash;
            # check for None before stripping it to get a relative path)
            img_url = parsed_data.get('img_url')
            if img_url:
                img_url = img_url[1:]
                img = await get_response(img_url)
                if img:
                    write_file(img_url, img, mode='wb')

    except Exception as e:
        print(f'Except on parse data: {e}. URL: {url}')
        traceback.print_tb(e.__traceback__)
        return
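Example #1 calls run_iter() for each chunk of urls, but its implementation is not part of these excerpts. A hedged sketch of what it might look like, assuming it simply schedules get_data_from() for every url in the chunk:

import asyncio

async def _parse_chunk(urls):
    await asyncio.gather(*(get_data_from(u) for u in urls))

def run_iter(urls):
    # Hypothetical driver: the real run_iter() is not shown in these examples.
    asyncio.run(_parse_chunk(urls))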
Example #6
def get_all_items_urls() -> set:
    """
        Get the full url list for every item on the site.
        Write the results to per-subcategory files
        and return a flat set of urls.
    """
    from parser_lib import write_file, all_subcategory_items, HOST
    chunks = []
    all_items_file = 'data/all_item_urls_flat.txt'

    if os.path.exists(all_items_file):
        with open(all_items_file, 'r') as f:
            urls = set(u.strip() for u in f)
        if urls:
            return urls

    if not os.path.exists('data'):
        os.mkdir('data')

    urls = get_all_menu_links(HOST)
    for url in urls:
        print(url)
        chunks.append(all_subcategory_items(url))

    urls_brands = get_all_menu_links_by_brands(HOST)
    for url in urls_brands:
        print(url)
        chunks.append(all_subcategory_items(url, '_brands'))

    # Write items urls to file
    urls = tuple(chain(*chunks))
    print(f'total urls: {len(urls)}')
    urls = set(urls)
    print(f'total unique urls: {len(urls)}')

    write_file(all_items_file, urls)

    return urls
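Taken together with Example #1, the end-to-end flow is: collect every item url once (cached in data/all_item_urls_flat.txt), then parse the items with retries. A minimal wiring sketch; the project's actual entry point is not shown, and the chunk size here is arbitrary:

if __name__ == '__main__':
    all_urls = get_all_items_urls()
    parse_until_complete(list(all_urls), chunksize=500)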