Example 1
    async def async_handler(url_list, workers, try_count, delay, return_bool):
        """
        make tasks and run them in a queue
        :return dict of {url: html_page}
        """
        logger.debug(f'number of input urls to download: {len(url_list)}')
        url_list = list(set(url_list))

        logger.debug(
            f'urls left after removing duplicates: {len(url_list)}')

        urls_splited = split_list(url_list, workers)

        responses = {}

        for urls in urls_splited:
            tasks = [
                asyncio.ensure_future(
                    single_page_downloader(url, try_count, delay))
                for url in urls
            ]
            res_list = await asyncio.gather(*tasks)
            if return_bool:
                # failed downloads return None, so skip them here
                responses.update({
                    url: page
                    for res in res_list if res
                    for url, page in res.items()
                })

        return responses if return_bool else None
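async_handler relies on split_list and single_page_downloader, which are defined next to it inside download_pages (Example 6). A minimal sketch of the batching behaviour it depends on, using the same chunking logic and a purely illustrative url list:

def split_list(input_list, step):
    # consecutive chunks of `step` items; the last chunk may be shorter
    return [
        input_list[i - step:i]
        for i in range(step, len(input_list) + step, step)
    ]

urls = [f'https://example.com/page/{n}' for n in range(7)]
for batch in split_list(urls, 3):
    print(len(batch), batch[0])
# -> 3 https://example.com/page/0
# -> 3 https://example.com/page/3
# -> 1 https://example.com/page/6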
Example 2
def collect_data_id_from_resource(pages, base, patterns):
    """
    general finding ids from list pages

    :param pages:
    :param base:
    :param patterns:
    :return:
    """

    logger.info(f'start collecting ids from {base}')
    new_ids = []

    pages_compressed_html = download_pages(pages)
    for page in pages:
        logger.debug(f'collecting ids from {page}')

        souped_page = soup(compressed_to_str(pages_compressed_html.pop(page)),
                           features='lxml')

        for pattern in patterns:
            # hrefs that match the pattern, with or without the base url
            new_pages = [
                tag['href'] for tag in souped_page.find_all(
                    'a', {'href': re.compile(f'({base})?{pattern}')})
            ]

            # make relative links absolute
            new_pages = [
                base + link if link.find('http') == -1 else link
                for link in new_pages
            ]

            # drop links that embed a second url (another 'http' after the scheme)
            new_pages = [
                link for link in new_pages if link[5:].find('http') == -1
            ]

            # strip query strings
            new_pages = [re.sub(r'/?\?.*', '', link) for link in new_pages]

            # the first capture group of the pattern holds the id
            new_ids += [
                re.search(f'{base}{pattern}', link).group(1)
                for link in new_pages
            ]

    return new_ids
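collect_data_id_from_resource expects every pattern to carry a capture group that holds the id, since the last step calls re.search(f'{base}{pattern}', link).group(1). A small sketch of that extraction step with a hypothetical base url and pattern (both illustrative only):

import re

base = 'https://example.com'
pattern = r'/movie/(\d+)'  # first capture group is the id

links = [
    'https://example.com/movie/42',
    'https://example.com/movie/1337/reviews',
]
ids = [re.search(f'{base}{pattern}', link).group(1) for link in links]
print(ids)  # -> ['42', '1337']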
Example 3
    async def single_page_downloader(url, try_count, delay):
        """
        download one page by send get request to the url
        save the page and return it as string
        """

        file_address = get_guessed_file_address(url)

        try:
            output = {url: load_compressed_object(file_address)}
            logger.debug(f'already downloaded {url}')

            return output if return_bool else None

        except FileNotFoundError:
            logger.debug(f'start downloading {url}')

        for i in range(try_count):
            try:
                async with aiohttp.ClientSession(
                        connector=aiohttp.TCPConnector()) as session:
                    async with session.get(url) as resp:
                        site_html = await resp.text()

                        compressed_html = str_to_compressed(site_html)
                        save_compressed_object(file_address, compressed_html)

                        output = {url: compressed_html}
                        return output if return_bool else None

            except Exception as error:
                logger.error(
                    f'try {i + 1}/{try_count} failed when downloading {url}: {error}'
                )
                await asyncio.sleep(delay)

        # only urls that could not be downloaded reach this point
        logger.error(
            f'download FAILED: could not get {url} after {try_count} tries'
        )
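This snippet assumes caching helpers (str_to_compressed, compressed_to_str, save_compressed_object, load_compressed_object) that are not shown in any of the examples. A minimal sketch of what they could look like, assuming zlib compression and plain files on disk; the real project may implement them differently:

import zlib


def str_to_compressed(text):
    # compress the html before caching it
    return zlib.compress(text.encode('utf-8'))


def compressed_to_str(compressed):
    return zlib.decompress(compressed).decode('utf-8')


def save_compressed_object(file_address, obj):
    with open(file_address, 'wb') as f:
        f.write(obj)


def load_compressed_object(file_address):
    # raises FileNotFoundError for urls that were never cached,
    # which is exactly what single_page_downloader catches
    with open(file_address, 'rb') as f:
        return f.read()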
Example 4
def make_soup(url):
    """
    get the BeautifulSoup object of this page

    :param
    url (str): the url of page that we want

    :returns
    BeautifulSoup object: content of page of given url
    """
    """
    1. load the page
        for new urls:
            download the html and save it as html file in downloaded pages

        for old urls:
            loads html for them from files to memory

    2. return page as soup object

    """

    if isinstance(url, list):
        raise MemoryError(
            'to avoid memory overflow, use the download_pages function to download a list of pages'
        )

    file_address = get_guessed_file_address(url)

    if os.path.isfile(file_address):
        logger.debug(f'already downloaded {url}')
        page_html = compressed_to_str(load_compressed_object(file_address))
    else:
        logger.debug(f'start downloading {url}')
        page_html = get_page(url)
        save_compressed_object(file_address, str_to_compressed(page_html))

    return soup(page_html, features='lxml')
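make_soup and the downloaders also rely on get_guessed_file_address to map a url to a cache file path. One plausible sketch, assuming a hash-based file name under a hypothetical cache directory (not the project's actual implementation):

import hashlib
import os

DOWNLOADED_PAGES_DIR = 'downloaded_pages'  # hypothetical cache directory


def get_guessed_file_address(url):
    # derive a stable file name from the url so repeated calls
    # for the same url hit the same cache file
    name = hashlib.md5(url.encode('utf-8')).hexdigest()
    return os.path.join(DOWNLOADED_PAGES_DIR, f'{name}.bin')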
Example 5
def arg_parse():
    '''
    command line interface
    '''

    parser = argparse.ArgumentParser(description='command line interface')

    parser.add_argument(
        '-r',
        '--run',
        dest='function',
        default=None,
        type=str,
        help='name of function to run',
    )

    parser.add_argument(
        '-db',
        '--db_name',
        dest='db',
        default=None,
        type=str,
        help='name of dataset',
    )

    parser.add_argument(
        '-log',
        '--log_level',
        dest='log_level',
        default=None,
        type=str,
        help='level of log',
    )

    args = parser.parse_args()

    if args.function:
        dataset = dbManager.dataset(args.db)
        logger.debug(f'running arg with args.function = {args.function}')

        if args.log_level:
            log_level = args.log_level
            if log_level == 'debug':
                logger.setLevel(logging.DEBUG)

            elif log_level == 'info':
                logger.setLevel(logging.INFO)

            elif log_level == 'error':
                logger.setLevel(logging.ERROR)

            elif log_level == 'critical':
                logger.setLevel(logging.CRITICAL)

            else:
                logger.setLevel(logging.WARNING)

        if args.function in ['st', 'start']:
            dataset.start()

        elif args.function in ['dr', 'download_resource']:
            dataset.download_resources()

        elif args.function in ['ip', 'init_project']:
            dbManager.init_project()

        elif args.function in ['fd', 'find_db']:
            dataset.find_ids()

        elif args.function in ['ud', 'update_db']:
            dataset.update()

        elif args.function in ['sct', 'schema_test']:
            dataset.schema_test()

    # return True when any argument other than a bare -log flag was passed
    if (len(sys.argv) == 1) or (len(sys.argv) == 2 and sys.argv[1] == '-log'):
        return False
    else:
        return True
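arg_parse returns True when a function was requested on the command line, so a hypothetical main.py entry point could use it to decide whether anything was run (script name and arguments below are illustrative only):

if __name__ == '__main__':
    # e.g. python main.py -r start -db movies -log debug
    if not arg_parse():
        print('no function requested; see --help')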
Example 6
def download_pages(url_list,
                   workers=50,
                   try_count=10,
                   delay=1,
                   return_bool=True):
    """
    download a list of the urls and save them if you want

    :param url_list: list of urls that we want to download
    :param workers:
    :param try_count:
    :param delay:
    :param return_bool:
    :return: list of responses
    """
    def split_list(input_list, step):
        # split input_list into consecutive chunks of `step` items
        return [
            input_list[i - step:i] for i in range(step,
                                                  len(input_list) + step, step)
        ]

    async def single_page_downloader(url, try_count, delay):
        """
        download one page by send get request to the url
        save the page and return it as string
        """

        file_address = get_guessed_file_address(url)

        try:
            output = {url: load_compressed_object(file_address)}
            logger.debug(f'already downloaded {url}')

            return output if return_bool else None

        except FileNotFoundError:
            logger.debug(f'start downloading {url}')

        for i in range(try_count):
            try:
                async with aiohttp.ClientSession(
                        connector=aiohttp.TCPConnector()) as session:
                    async with session.get(url) as resp:
                        site_html = await resp.text()

                        compressed_html = str_to_compressed(site_html)
                        save_compressed_object(file_address, compressed_html)

                        output = {url: compressed_html}
                        return output if return_bool else None

            except Exception as error:
                logger.error(
                    f'try {i + 1}/{try_count} failed when downloading {url}: {error}'
                )
                await asyncio.sleep(delay)

        # only urls that could not be downloaded reach this point
        logger.error(
            f'download FAILED: could not get {url} after {try_count} tries'
        )

    async def async_handler(url_list, workers, try_count, delay, return_bool):
        """
        make tasks and run them in a queue
        :return dict of {url: html_page}
        """
        logger.debug(f'number of input urls to download: {len(url_list)}')
        url_list = list(set(url_list))

        logger.debug(
            f'urls left after removing duplicates: {len(url_list)}')

        urls_splited = split_list(url_list, workers)

        responses = {}

        for urls in urls_splited:
            tasks = [
                asyncio.ensure_future(
                    single_page_downloader(url, try_count, delay))
                for url in urls
            ]
            res_list = await asyncio.gather(*tasks)
            if return_bool:
                # failed downloads return None, so skip them here
                responses.update({
                    url: page
                    for res in res_list if res
                    for url, page in res.items()
                })

        return responses if return_bool else None

    # main body of download_pages

    logger.debug(
        f'start download_pages for urls = '
        f'{url_list if len(url_list) < 2 else str(url_list[:2]).replace("]", ", ...]")} len={len(url_list)}'
    )

    loop = asyncio.new_event_loop()
    task = loop.create_task(
        async_handler(url_list, workers, try_count, delay, return_bool))
    response = loop.run_until_complete(task)
    loop.close()

    return response if return_bool else None
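A usage sketch for download_pages, with an illustrative url list: download a couple of pages, then turn one cached page back into text with compressed_to_str.

urls = [
    'https://example.com/movie/1',
    'https://example.com/movie/2',
]

# returns {url: compressed_html}; pass return_bool=False to only fill the cache
pages = download_pages(urls, workers=10, try_count=3, delay=1)

first_html = compressed_to_str(pages[urls[0]])
print(first_html[:80])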