Python AsyncScrapeScheduler.runの例

プログラミング言語: Python

名前空間/パッケージ名: GoogleScraper.async_mode

メソッド/関数: run

hotexamples.comのコード掲載数: 4

Python AsyncScrapeScheduler.run - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのGoogleScraper.async_mode.AsyncScrapeScheduler.runの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

AsyncScrapeScheduler(3)

run(3)

よく使われるメソッド

AsyncScrapeScheduler (3)

run (3)

コード例 #1

ファイルを表示

ファイル: core.py プロジェクト: magicknight/GoogleScraper

def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
    """Run the GoogleScraper application as determined by the configuration.

    The main() function encompasses the core functionality of GoogleScraper. It
    builds the configuration, handles the one-shot maintenance options
    (view config, version, clean, shell, cache maintenance, simulate), creates
    the scrape jobs and finally dispatches them to the selected scrape method.

    Args:
        return_results: When True, the ScraperSearch object describing this run
                        is returned to the caller.
        parse_cmd_line: Whether to get options from the command line or not.
        config_from_dict: Configuration that is passed when GoogleScraper is called as library.

    Returns:
        The ScraperSearch instance of this run when return_results is True.
        None otherwise, and also None whenever one of the one-shot options
        ends the function early.

    Raises:
        WrongConfigurationError: If the keyword file does not exist, more than
            100 results per page are requested or the search type is invalid.
        Exception: If no proxies are available and using the own IP is not
            enabled, or if the scrape method is unknown.
    """
    external_config_file_path = cmd_line_args = None

    if parse_cmd_line:
        cmd_line_args = get_command_line()

        if cmd_line_args.get('config_file', None):
            external_config_file_path = os.path.abspath(cmd_line_args.get('config_file'))

    config = get_config(cmd_line_args, external_config_file_path, config_from_dict)

    # Numeric log levels are translated to their names before being handed on.
    if isinstance(config['log_level'], int):
        config['log_level'] = logging.getLevelName(config['log_level'])

    setup_logger(level=config.get('log_level').upper())

    if config.get('view_config', False):
        # Use a context manager so the file handle is closed deterministically.
        with open(os.path.join(get_base_path(), 'scrape_config.py')) as config_source:
            print(config_source.read())
        return

    if config.get('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    if config.get('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(config.get('cachedir')))
        except OSError:
            # Best effort: a missing or undeletable database file is not fatal.
            pass
        return

    init_outfile(config, force_reload=True)

    kwfile = config.get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = config.get('keyword')
    keywords = set(config.get('keywords', []))
    proxy_file = config.get('proxy_file', '')
    proxy_db = config.get('mysql_proxy_db', '')
    proxy_list = config.get('proxy_list', [])

    # when no search engine is specified, use google
    search_engines = config.get('search_engines', ['google',])
    if not isinstance(search_engines, list):
        if search_engines == '*':
            search_engines = config.get('supported_search_engines')
        else:
            search_engines = search_engines.split(',')

    assert isinstance(search_engines, list), 'Search engines must be a list like data type!'
    search_engines = set(search_engines)

    num_search_engines = len(search_engines)
    num_workers = int(config.get('num_workers'))
    scrape_method = config.get('scrape_method')
    pages = int(config.get('num_pages_for_keyword', 1))
    method = config.get('scrape_method', 'http')

    if config.get('shell', False):
        # Drop into an interactive console with the database objects preloaded.
        namespace = {}
        session_cls = get_session(config, scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        # Just print the help.
        get_command_line(True)
        print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and '
            'keyword with --keyword.')
        return

    cache_manager = CacheManager(config)

    if config.get('fix_cache_names'):
        cache_manager.fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    # A single keyword takes precedence over the keyword collection.
    keywords = [keyword, ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise WrongConfigurationError('The keyword file {} does not exist.'.format(kwfile))
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    # splitext removes exactly the '.py' suffix. rstrip('.py')
                    # would strip any trailing '.', 'p' or 'y' characters and
                    # mangle names such as 'happy.py'.
                    modname = os.path.splitext(os.path.basename(kwfile))[0]
                    scrape_jobs = getattr(__import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                with open(kwfile, 'r') as keyword_source:
                    keywords = {
                        line.strip()
                        for line in keyword_source.read().split('\n')
                        if line.strip()
                    }

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if config.get('clean_cache_files', False):
        cache_manager.clean_cachefiles()
        return

    if config.get('check_oto', False):
        cache_manager._caching_is_one_to_one(keyword)

    if config.get('num_results_per_page') > 100:
        raise WrongConfigurationError('Not more that 100 results per page available for searches.')

    proxies = []

    # Proxy sources in order of precedence: explicit list, MySQL database, file.
    if proxy_list:
        proxies = proxy_list
    elif proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    # A None entry stands for scraping with the own IP address.
    if config.get('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise Exception('No proxies available and using own IP is prohibited by configuration. Turning down.')

    valid_search_types = ('normal', 'video', 'news', 'image')
    if config.get('search_type') not in valid_search_types:
        raise WrongConfigurationError('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    if config.get('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper would have been run without the --simulate flag, it would have:')
        logger.info('Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'.format(
            len(keywords), int(config.get('num_results_per_page', 0)),
            int(config.get('num_pages_for_keyword'))))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(len(proxies)))
        if proxies:
            # format() instead of '+' so a non-string port does not raise.
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join(['{}:{}'.format(proxy.host, proxy.port) for proxy in proxies if proxy])))

        logger.info('By using {} mode with {} worker instances'.format(config.get('scrape_method'),
                                                                       int(config.get('num_workers'))))
        return

    # get a sqlalchemy session (requested with scoped=False)
    session_cls = get_session(config, scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(config, session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # Continue the last scrape if requested. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and config.get('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(os.path.getmtime(last_search.keyword_file))

            # if the last modification is older then the starting of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=kwfile,
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines)
        )

    # First of all, lets see how many requests remain to issue after searching the cache.
    if config.get('do_caching'):
        scrape_jobs = cache_manager.parse_all_cached_files(scrape_jobs, session, scraper_search)

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        logger.info('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'.format(
            num_keywords=len(list(scrape_jobs)),
            num_proxies=len(proxies),
            num_threads=num_search_engines))

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            # One worker factory per (search engine, proxy, worker index).
            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for _ in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(
                                config,
                                cache_manager=cache_manager,
                                mode=method,
                                proxy=proxy,
                                search_engine=search_engine,
                                session=session,
                                db_lock=db_lock,
                                cache_lock=cache_lock,
                                scraper_search=scraper_search,
                                captcha_lock=captcha_lock,
                                progress_queue=q,
                                browser_num=num_worker
                            )
                        )

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                # Rotate through the queue at most once per job. An unbounded
                # loop would spin forever when no worker accepts the job and
                # block forever on an empty queue.
                for _ in range(workers.qsize()):
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break
                else:
                    logger.warning('No suitable worker found for job {}. Skipping it.'.format(job))

            threads = []

            # get_worker() may return a falsy value; only real threads are kept.
            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(config, scrape_jobs, cache_manager=cache_manager, session=session, scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise Exception('No such scrape_method {}'.format(config.get('scrape_method')))

    from GoogleScraper.output_converter import close_outfile
    close_outfile()

    # Record the end of the run and persist the search object.
    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search

コード例 #2

ファイルを表示

ファイル: core.py プロジェクト: tcapilla/GoogleScraper

def main(return_results=False, parse_cmd_line=True):
    """Run the GoogleScraper application as determined by the configuration.

    The main() function encompasses the core functionality of GoogleScraper. It
    reads the global ``Config`` object, handles the one-shot maintenance
    options (view config, version, clean, shell, cache maintenance, simulate),
    creates the scrape jobs, dispatches them to the selected scrape method and
    finally hands the database tables and the SERP log to the S3 writers.

    Args:
        return_results: When True, the ScraperSearch object describing this run
                        is returned to the caller.
        parse_cmd_line: Whether to get options from the command line or not.

    Returns:
        The ScraperSearch instance of this run when return_results is True.
        None otherwise, and also None whenever one of the one-shot options
        ends the function early.

    Raises:
        InvalidConfigurationException: If no proxies are available and using
            the own IP is not enabled, or if the scrape method is unknown.
            Other configuration problems are reported through raise_or_log().
    """
    if parse_cmd_line:
        parse_cmd_args()

    # If the configuration file to use is explicitly specified, update the current configuration
    # with it.
    if Config['GLOBAL'].get('config_file', None):
        update_config_with_file(Config['GLOBAL'].get('config_file', None))

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE

        # Use a context manager so the file handle is closed deterministically.
        with open(CONFIG_FILE) as config_source:
            print(config_source.read())
        return

    if Config['GLOBAL'].getboolean('version'):
        from GoogleScraper.version import __version__

        print(__version__)
        return

    if Config['GLOBAL'].getboolean('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(Config['GLOBAL'].get('cachedir')))
        except OSError:
            # Best effort: a missing or undeletable database file is not fatal.
            pass
        return

    init_outfile(force_reload=True)

    kwfile = Config['SCRAPING'].get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = Config['SCRAPING'].get('keyword')
    # The fallback must be a string: the result is split on newlines, and a
    # list fallback would raise AttributeError when the option is missing.
    # NOTE(review): assumes Config sections behave like configparser sections.
    keywords = {
        kw
        for kw in Config['SCRAPING'].get('keywords', '').split('\n')
        if kw
    }
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')
    proxy_tuples = Config['GLOBAL'].get('proxy_tuples', [])

    # '*' selects every supported search engine.
    se = Config['SCRAPING'].get('search_engines', 'google')
    if se.strip() == '*':
        se = Config['SCRAPING'].get('supported_search_engines', 'google')

    search_engines = list({
        search_engine.strip()
        for search_engine in se.split(',') if search_engine.strip()
    })
    assert search_engines, 'No search engine specified'
    num_search_engines = len(search_engines)
    num_workers = Config['SCRAPING'].getint('num_workers')
    scrape_method = Config['SCRAPING'].get('scrape_method')
    pages = Config['SCRAPING'].getint('num_pages_for_keyword', 1)
    method = Config['SCRAPING'].get('scrape_method', 'http')

    if Config['GLOBAL'].getboolean('shell', False):
        # Drop into an interactive console with the database objects preloaded.
        namespace = {}
        session_cls = get_session(scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        raise_or_log(
            'No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and '
            'keyword with --keyword.')
        # Just print the help.
        get_command_line(True)
        return

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    # A single keyword takes precedence over the keyword collection.
    keywords = [
        keyword,
    ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise_or_log('The keyword file {} does not exist.'.format(kwfile),
                         exception_obj=InvalidConfigurationException)
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    # splitext removes exactly the '.py' suffix. rstrip('.py')
                    # would strip any trailing '.', 'p' or 'y' characters and
                    # mangle names such as 'happy.py'.
                    modname = os.path.splitext(os.path.basename(kwfile))[0]
                    scrape_jobs = getattr(
                        __import__(modname, fromlist=['scrape_jobs']),
                        'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                with open(kwfile, 'r') as keyword_source:
                    keywords = {
                        line.strip()
                        for line in keyword_source.read().split('\n')
                        if line.strip()
                    }

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords,
                                                       search_engines,
                                                       scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if Config['GLOBAL'].getboolean('clean_cache_files', False):
        clean_cachefiles()
        return

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise_or_log(
            'Not more that 100 results per page available for searches.',
            exception_obj=InvalidConfigurationException)

    proxies = []

    # Proxy sources in order of precedence: explicit tuples, MySQL database, file.
    if proxy_tuples:
        proxies = tuples_to_proxies(proxy_tuples)
    elif proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    # Randomize proxies
    shuffle(proxies)

    # A None entry stands for scraping with the own IP address.
    if Config['SCRAPING'].getboolean('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise InvalidConfigurationException(
            'No proxies available and using own IP is prohibited by configuration. Turning down.'
        )

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise_or_log('Invalid search type! Select one of {}'.format(
            repr(valid_search_types)),
                     exception_obj=InvalidConfigurationException)

    if Config['GLOBAL'].getboolean('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info(
            'If GoogleScraper would have been run without the --simulate flag, it would have:'
        )
        logger.info(
            'Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'
            .format(len(keywords),
                    Config['SCRAPING'].getint('num_results_per_page', 0),
                    Config['SCRAPING'].getint('num_pages_for_keyword')))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(
            len(proxies)))
        if proxies:
            # format() instead of '+' so a non-string port does not raise.
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([
                    '{}:{}'.format(proxy.host, proxy.port)
                    for proxy in proxies if proxy
                ])))

        logger.info('By using {} mode with {} worker instances'.format(
            Config['SCRAPING'].get('scrape_method'),
            Config['SCRAPING'].getint('num_workers')))
        return

    # get a sqlalchemy session (requested with scoped=False)
    session_cls = get_session(scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # Continue the last scrape if requested. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and Config['GLOBAL'].getboolean('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(last_search.keyword_file))

            # if the last modification is older then the starting of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            id=generate_id(),
            keyword_file=os.path.abspath(kwfile),
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

    # First of all, lets see how many requests remain to issue after searching the cache.
    if Config['GLOBAL'].getboolean('do_caching'):
        scrape_jobs = parse_all_cached_files(scrape_jobs, session,
                                             scraper_search)

    # Stays None when there is nothing left to scrape, so the final upload
    # step can tell whether a SERP log was created at all.
    serp_log = None

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        serp_log = SERPLog()

        out('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'
            .format(num_keywords=len(list(scrape_jobs)),
                    num_proxies=len(proxies),
                    num_threads=num_search_engines),
            lvl=1)

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(q, len(scrape_jobs))
            progress_thread.start()

            # One worker factory per (search engine, proxy, worker index).
            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:
                    for _ in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(mode=method,
                                                proxy=proxy,
                                                search_engine=search_engine,
                                                session=session,
                                                db_lock=db_lock,
                                                cache_lock=cache_lock,
                                                scraper_search=scraper_search,
                                                captcha_lock=captcha_lock,
                                                progress_queue=q,
                                                browser_num=num_worker))

            for job in scrape_jobs:
                # Rotate through the queue at most once per job. An unbounded
                # loop would spin forever when no worker accepts the job and
                # block forever on an empty queue.
                for _ in range(workers.qsize()):
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break
                else:
                    logger.warning(
                        'No suitable worker found for job {}. Skipping it.'.format(job))

            threads = []

            # get_worker() may return a falsy value; only real threads are kept.
            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker(serp_log)
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(scrape_jobs,
                                             session=session,
                                             scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise InvalidConfigurationException(
                'No such scrape_method {}'.format(
                    Config['SCRAPING'].get('scrape_method')))

        # Wait for the progress display to finish, if one was started.
        if progress_thread is not None:
            progress_thread.join()

    # in the end, close the json file.
    from GoogleScraper.output_converter import outfile, output_format

    if output_format == 'json':
        outfile.end()

    # Record the end of the run and persist the search object.
    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    ## Copy data to S3
    retry = Config['SCRAPE_INFOS'].getboolean('retry')
    if retry:
        print("Saving data to S3 as RETRY...")
    else:
        print("Saving data to S3...")

    table_objs = [
        ScraperSearch, SERP, Link, Proxy, SearchEngine, SearchEngineProxyStatus
    ]
    s3writers = [
        s3.S3Table(to,
                   Config['SCRAPE_INFOS'].get('scrape_id'),
                   Config['ENV'],
                   retry=retry) for to in table_objs
    ]
    # Load every table first, then write, keeping the two phases separate.
    for w in s3writers:
        w.load_data(session)
    for w in s3writers:
        w.write_buffer_to_s3()

    ## Save SERPS
    # serp_log only exists when scrape jobs were left after the cache lookup.
    # Calling it unconditionally raised NameError for fully cached runs.
    if serp_log is not None:
        serp_log.write_to_s3()

    ##
    if return_results:
        return scraper_search

コード例 #3

ファイルを表示

def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() functions job to check the validity of the provided
    configuration.

    Several configuration flags short-circuit the run and return None before
    any scraping happens: 'view_config', 'version', 'clean', 'shell',
    'fix_cache_names', 'clean_cache_files' and 'simulate'. The function also
    returns early (after printing the help) when no keywords were supplied.

    Args:
        return_results: When true, the ScraperSearch record of this run is
                        returned to the caller.
        parse_cmd_line: Whether to get options from the command line or not.
        config_from_dict: Configuration that is passed when GoogleScraper is called as library.

    Returns:
        The ScraperSearch instance describing this run when return_results is
        true. Else, nothing.

    Raises:
        WrongConfigurationError: If the keyword file does not exist, more than
            100 results per page are requested, or the search type is invalid.
        Exception: If no proxies are available and using the own IP is
            disabled, or if the configured scrape method is unknown.
    """
    external_config_file_path = cmd_line_args = None

    if parse_cmd_line:
        cmd_line_args = get_command_line()

        if cmd_line_args.get('config_file', None):
            external_config_file_path = os.path.abspath(
                cmd_line_args.get('config_file'))

    config = get_config(cmd_line_args, external_config_file_path,
                        config_from_dict)

    # setup_logger() is handed an upper-cased level *name*, so numeric levels
    # are converted to their name first.
    if isinstance(config['log_level'], int):
        config['log_level'] = logging.getLevelName(config['log_level'])

    setup_logger(level=config.get('log_level').upper())

    if config.get('view_config', False):
        # Context manager so the file handle is closed deterministically.
        with open(os.path.join(get_base_path(),
                               'scrape_config.py')) as config_file:
            print(config_file.read())
        return

    if config.get('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    if config.get('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(config.get('cachedir')))
        except OSError:
            # Cleaning is best effort (e.g. the database file may not exist),
            # but unrelated errors such as KeyboardInterrupt must not be
            # swallowed like the former bare ``except`` did.
            pass
        return

    init_outfile(config, force_reload=True)

    kwfile = config.get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = config.get('keyword')
    keywords = set(config.get('keywords', []))
    proxy_file = config.get('proxy_file', '')
    proxy_db = config.get('mysql_proxy_db', '')

    # when no search engine is specified, use google
    search_engines = config.get('search_engines', [
        'google',
    ])
    if not isinstance(search_engines, list):
        # A string is either the wildcard '*' (all supported engines) or a
        # comma separated list of engine names.
        if search_engines == '*':
            search_engines = config.get('supported_search_engines')
        else:
            search_engines = search_engines.split(',')

    assert isinstance(search_engines,
                      list), 'Search engines must be a list like data type!'
    search_engines = set(search_engines)

    num_search_engines = len(search_engines)
    num_workers = int(config.get('num_workers'))
    scrape_method = config.get('scrape_method')
    pages = int(config.get('num_pages_for_keyword', 1))
    method = config.get('scrape_method', 'http')

    if config.get('shell', False):
        # Drop into an interactive console with the database objects preloaded.
        namespace = {}
        session_cls = get_session(config, scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        # Just print the help.
        get_command_line(True)
        print(
            'No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and '
            'keyword with --keyword.')
        return

    cache_manager = CacheManager(config)

    if config.get('fix_cache_names'):
        cache_manager.fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    # A single --keyword takes precedence over the configured keyword set.
    keywords = [
        keyword,
    ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise WrongConfigurationError(
                'The keyword file {} does not exist.'.format(kwfile))

        if kwfile.endswith('.py'):
            # we need to import the variable "scrape_jobs" from the module.
            sys.path.append(os.path.dirname(kwfile))
            try:
                # splitext() removes exactly the extension. The former
                # rstrip('.py') stripped any trailing '.', 'p' or 'y'
                # characters, turning e.g. 'happy.py' into 'ha'.
                modname = os.path.splitext(os.path.basename(kwfile))[0]
                scrape_jobs = getattr(
                    __import__(modname, fromlist=['scrape_jobs']),
                    'scrape_jobs')
            except ImportError as e:
                logger.warning(e)
        else:
            # Clean the keywords of duplicates right in the beginning
            with open(kwfile, 'r', encoding='utf8') as keyword_file:
                keywords = {
                    line.strip()
                    for line in keyword_file.read().split('\n')
                    if line.strip()
                }

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords,
                                                       search_engines,
                                                       scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if config.get('clean_cache_files', False):
        cache_manager.clean_cachefiles()
        return

    if config.get('check_oto', False):
        cache_manager._caching_is_one_to_one(keyword)

    if config.get('num_results_per_page') > 100:
        raise WrongConfigurationError(
            'Not more that 100 results per page available for searches.')

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    # A ``None`` entry in the proxy list stands for "scrape with the own IP".
    if config.get('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise Exception(
            'No proxies available and using own IP is prohibited by configuration. Turning down.'
        )

    valid_search_types = ('normal', 'video', 'news', 'image')
    if config.get('search_type') not in valid_search_types:
        raise WrongConfigurationError(
            'Invalid search type! Select one of {}'.format(
                repr(valid_search_types)))

    if config.get('simulate', False):
        # Only report what would be done, then leave without scraping.
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info(
            'If GoogleScraper would have been run without the --simulate flag, it would have:'
        )
        logger.info(
            'Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'
            .format(len(keywords), int(config.get('num_results_per_page', 0)),
                    int(config.get('num_pages_for_keyword'))))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(
            len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([
                    proxy.host + ':' + proxy.port for proxy in proxies if proxy
                ])))

        logger.info('By using {} mode with {} worker instances'.format(
            config.get('scrape_method'), int(config.get('num_workers'))))
        return

    # get a (non-scoped) sqlalchemy session
    session_cls = get_session(config, scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(config, session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and config.get('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(last_search.keyword_file))

            # if the last modification is older then the starting of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=kwfile,
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

    # First of all, lets see how many requests remain to issue after searching the cache.
    if config.get('do_caching'):
        scrape_jobs = cache_manager.parse_all_cached_files(
            scrape_jobs, session, scraper_search)

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        # NOTE(review): 'num_threads' reports the number of search engines,
        # while one worker is created per engine/proxy/worker slot below --
        # confirm which figure is intended.
        logger.info(
            'Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'
            .format(num_keywords=len(list(scrape_jobs)),
                    num_proxies=len(proxies),
                    num_threads=num_search_engines))

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            # One worker factory per (search engine, proxy, worker slot).
            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for _ in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(config,
                                                cache_manager=cache_manager,
                                                mode=method,
                                                proxy=proxy,
                                                search_engine=search_engine,
                                                session=session,
                                                db_lock=db_lock,
                                                cache_lock=cache_lock,
                                                scraper_search=scraper_search,
                                                captcha_lock=captcha_lock,
                                                progress_queue=q,
                                                browser_num=num_worker))

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                # Rotate through the queue at most once per job. The former
                # ``while True`` loop never terminated when no worker was
                # suitable for a job (or when the queue was empty).
                for _ in range(workers.qsize()):
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break
                else:
                    logger.warning(
                        'No suitable worker found for job {}. Skipping it.'.
                        format(job))

            threads = []

            # Factories may yield no thread; only real threads are collected.
            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(config,
                                             scrape_jobs,
                                             cache_manager=cache_manager,
                                             session=session,
                                             scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise Exception('No such scrape_method {}'.format(
                config.get('scrape_method')))

    from GoogleScraper.output_converter import close_outfile
    close_outfile()

    # Record the end of the run and persist the search record.
    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search

コード例 #4

ファイルを表示

def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() functions job to check the validity of the provided
    configuration.

    Several configuration flags short-circuit the run and return None before
    any scraping happens: 'view_config', 'version', 'clean',
    'fix_cache_names', 'clean_cache_files' and 'simulate'.

    Args:
        return_results: When true, the ScraperSearch record of this run is
                        returned to the caller.
        parse_cmd_line: Whether to get options from the command line or not.
        config_from_dict: Configuration that is passed when GoogleScraper is called as library.

    Returns:
        The ScraperSearch instance describing this run when return_results is
        true. Else, nothing.

    Raises:
        WrongConfigurationError: If more than 100 results per page are
            requested or the search type is invalid.
        Exception: If no proxies are available and using the own IP is
            disabled, or if the configured scrape method is unknown.
    """

    external_config_file_path = cmd_line_args = None

    if parse_cmd_line:
        cmd_line_args = get_command_line()

        if cmd_line_args.get('config_file', None):
            external_config_file_path = os.path.abspath(
                cmd_line_args.get('config_file'))

    config = get_config(cmd_line_args, external_config_file_path,
                        config_from_dict)

    keywords = config.get('keywords')
    kwfile = config.get('keyword_file', None)

    # setup_logger() is handed an upper-cased level *name*, so numeric levels
    # are converted to their name first.
    if isinstance(config['log_level'], int):
        config['log_level'] = logging.getLevelName(config['log_level'])

    setup_logger(level=config.get('log_level').upper())

    if config.get('view_config', False):
        # Context manager so the file handle is closed deterministically.
        with open(os.path.join(get_base_path(),
                               'scrape_config.py')) as config_file:
            print(config_file.read())
        return

    if config.get('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    if config.get('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(config.get('cachedir')))
        except OSError:
            # Cleaning is best effort (e.g. the database file may not exist),
            # but unrelated errors such as KeyboardInterrupt must not be
            # swallowed like the former bare ``except`` did.
            pass
        return

    init_outfile(config, force_reload=True)  # in output_converter.py

    proxy_file = config.get('proxy_file', '')
    proxy_db = config.get('mysql_proxy_db', '')

    setup_shell_config(config)
    search_engines = get_search_engines(
        config.get('search_engines', ['google']),
        config.get('supported_search_engines'))

    num_search_engines = len(search_engines)
    num_workers = int(config.get('num_workers'))
    scrape_method = config.get('scrape_method')
    pages = int(config.get('num_pages_for_keyword', 1))
    method = config.get('scrape_method', 'http')

    cache_manager = CacheManager(config)
    if config.get('fix_cache_names'):
        cache_manager.fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    # Jobs are always derived from the configured keywords here; the former
    # ``scrape_jobs = {}`` / ``if not scrape_jobs`` pair was a dead conditional.
    scrape_jobs = list(
        default_scrape_jobs_for_keywords(keywords, search_engines,
                                         scrape_method, pages))

    if config.get('clean_cache_files', False):
        cache_manager.clean_cachefiles()
        return

    if config.get('check_oto', False):
        # ``keyword`` was never defined in this function and raised a
        # NameError; read the single keyword from the configuration instead.
        cache_manager._caching_is_one_to_one(config.get('keyword'))

    if config.get('num_results_per_page') > 100:
        raise WrongConfigurationError(
            'Not more that 100 results per page available for searches.')

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    # A ``None`` entry in the proxy list stands for "scrape with the own IP".
    if config.get('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise Exception(
            'No proxies available and using own IP is prohibited by configuration. Turning down.'
        )

    if config.get('search_type') not in VALID_SEARCH_TYPES:
        raise WrongConfigurationError(
            'Invalid search type! Select one of {}'.format(
                repr(VALID_SEARCH_TYPES)))

    if config.get('simulate', False):
        run_simulation(config.get('num_results_per_page', 0),
                       config.get('num_pages_for_keyword'),
                       config.get('num_workers'))
        # A simulation must not fall through into the real scrape below.
        return

    # get a (non-scoped) sqlalchemy session
    session_cls = get_session(config, scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(config, session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None

    if kwfile and config.get('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(last_search.keyword_file))

            # if the last modification is older then the starting of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=kwfile,
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

    # First of all, lets see how many requests remain to issue after searching the cache.
    if config.get('do_caching'):
        scrape_jobs = cache_manager.parse_all_cached_files(
            scrape_jobs, session, scraper_search)

    if scrape_jobs:
        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        # NOTE(review): 'num_threads' reports the number of search engines,
        # while one worker is created per engine/proxy/worker slot below --
        # confirm which figure is intended.
        logger.info(
            'Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'
            .format(num_keywords=len(list(scrape_jobs)),
                    num_proxies=len(proxies),
                    num_threads=num_search_engines))

        progress_thread = None

        # Let the games begin

        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            # One worker factory per (search engine, proxy, worker slot).
            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for _ in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(config,
                                                cache_manager=cache_manager,
                                                mode=method,
                                                proxy=proxy,
                                                search_engine=search_engine,
                                                session=session,
                                                db_lock=db_lock,
                                                cache_lock=cache_lock,
                                                scraper_search=scraper_search,
                                                captcha_lock=captcha_lock,
                                                progress_queue=q,
                                                browser_num=num_worker))

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                # Rotate through the queue at most once per job. The former
                # ``while True`` loop never terminated when no worker was
                # suitable for a job (or when the queue was empty).
                for _ in range(workers.qsize()):
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break
                else:
                    logger.warning(
                        'No suitable worker found for job {}. Skipping it.'.
                        format(job))

            threads = []

            # Factories may yield no thread; only real threads are collected.
            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue. The 'done'
            # marker is sent before joining so the join cannot wait forever
            # on a progress thread that is still expecting results.
            q.put('done')
            progress_thread.join()

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(config,
                                             scrape_jobs,
                                             cache_manager=cache_manager,
                                             session=session,
                                             scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise Exception('No such scrape_method {}'.format(
                config.get('scrape_method')))

    from GoogleScraper.output_converter import close_outfile
    close_outfile()

    # Record the end of the run and persist the search record.
    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search