def assign_keywords_to_scrapers(all_keywords):
    """Scrapers are often threads or asynchronous objects.

    Splitting the keywords equally among the workers is crucial for maximal performance.

    Args:
        all_keywords: All keywords to scrape.

    Returns:
        A list of lists. Each inner list should be assigned to an individual scraper.
    """
    mode = Config['SCRAPING'].get('scrapemethod')

    if mode == 'sel':
        num_scrapers = Config['SELENIUM'].getint('num_browser_instances', 1)
    elif mode == 'http':
        num_scrapers = Config['HTTP'].getint('num_threads', 1)
    else:
        # Unknown scrape method: fall back to a single scraper so the
        # integer division below cannot divide by zero.
        num_scrapers = 1

    if len(all_keywords) > num_scrapers:
        kwgroups = grouper(all_keywords, len(all_keywords) // num_scrapers, fillvalue=None)
    else:
        # Fewer keywords than scrapers: give each scraper a single-keyword group.
        kwgroups = [[kw, ] for kw in all_keywords]

    return kwgroups
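# The grouper() helper called above is not defined in this section. Below is a minimal
# sketch of what it presumably looks like, following the standard itertools recipe.
# This is only an assumption that matches the call sites grouper(iterable, n, fillvalue=None);
# the real GoogleScraper implementation may differ.
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks: grouper('ABCDEFG', 3, 'x') -> ABC DEF Gxx."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)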
def main(): """Runs the GoogleScraper application as determined by the various configuration points.""" global Config Config = get_config(True, True) if Config['GLOBAL'].getboolean('view_config'): from GoogleScraper.config import CONFIG_FILE print(open(CONFIG_FILE).read()) sys.exit(0) if Config['GLOBAL'].getboolean('do_caching'): d = Config['GLOBAL'].get('cachedir') if not os.path.exists(d): os.mkdir(d, 0o744) else: maybe_clean_cache() kwfile = Config['SCRAPING'].get('keyword_file') keyword = Config['SCRAPING'].get('keyword') keywords = set(Config['SCRAPING'].get('keywords', '').split('\n')) proxy_file = Config['GLOBAL'].get('proxy_file', '') proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '') if not (keyword or keywords) and not kwfile: raise ValueError('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}~') if Config['GLOBAL'].getboolean('fix_cache_names'): fix_broken_cache_names() sys.exit('renaming done. restart for normal use.') keywords = [keyword,] if keyword else keywords if kwfile: if not os.path.exists(kwfile): raise ValueError('The keyword file {} does not exist.'.format(kwfile)) else: # Clean the keywords of duplicates right in the beginning keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')]) if Config['GLOBAL'].getboolean('check_oto', False): _caching_is_one_to_one(keyword) if Config['SCRAPING'].getint('num_results_per_page') > 100: raise ValueError('Not more that 100 results per page available for Google searches.') if proxy_db: proxies = get_proxies_from_mysql_db(proxy_db) elif proxy_file: proxies = parse_proxy_file(proxy_file) else: proxies = [] valid_search_types = ('normal', 'video', 'news', 'image') if Config['SCRAPING'].get('search_type') not in valid_search_types: ValueError('Invalid search type! Select one of {}'.format(repr(valid_search_types))) # Let the games begin if Config['SCRAPING'].get('scrapemethod', '') == 'sel': conn = maybe_create_db() # First of all, lets see how many keywords remain to scrape after parsing the cache if Config['GLOBAL'].getboolean('do_caching'): remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate')) else: remaining = keywords if Config['GLOBAL'].getboolean('simulate'): # TODO: implement simulation raise NotImplementedError('Simulating is not implemented yet!') # Create a lock to sync file access rlock = threading.RLock() # A lock to prevent multiple threads from solving captcha. 
lock = threading.Lock() max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances') if len(remaining) > max_sel_browsers: kwgroups = grouper(remaining, len(remaining)//max_sel_browsers, fillvalue=None) else: # thats a little special there :) kwgroups = [[kw, ] for kw in remaining] # Distribute the proxies evenly on the kws to search scrapejobs = [] Q = queue.Queue() proxies.append(None) if Config['SCRAPING'].getboolean('use_own_ip') else None if not proxies: logger.info("No ip's available for scanning.") chunks_per_proxy = math.ceil(len(kwgroups)/len(proxies)) for i, chunk in enumerate(kwgroups): scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i//chunks_per_proxy])) for t in scrapejobs: t.start() handler = ResultsHandler(Q, conn) handler.start() for t in scrapejobs: t.join() # All scrape jobs done, signal the db handler to stop Q.put(Config['GLOBAL'].get('all_processed_sig')) handler.join() conn.commit() conn.close() elif Config['SCRAPING'].get('scrapemethod') == 'http': if Config['SCRAPING'].getboolean('deep_scrape', False): # TODO: implement deep scrape raise NotImplementedError('Sorry. Currently deep_scrape is not implemented.') else: results = [] for kw in keywords: r = scrape(kw, num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10), num_pages=Config['SCRAPING'].getint('num_pages', 1), scrapemethod='http') results.append(r) if Config['GLOBAL'].get('print'): print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0), view=Config['HTTP'].get('view', False)) else: raise ValueError('No such scrapemethod. Use "http" or "sel"')
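# Illustration (not part of GoogleScraper): how the proxy distribution above behaves.
# The names below are made up for the example. With 7 keyword groups and 3 proxies,
# chunks_per_proxy = ceil(7 / 3) = 3, so groups 0-2 use proxies[0], groups 3-5 use
# proxies[1], and group 6 uses proxies[2].
import math

example_kwgroups = [['kw{}'.format(i)] for i in range(7)]   # hypothetical keyword groups
example_proxies = ['proxy_a', 'proxy_b', 'proxy_c']         # hypothetical proxies
chunks_per_proxy = math.ceil(len(example_kwgroups) / len(example_proxies))
for i, group in enumerate(example_kwgroups):
    print(i, group, '->', example_proxies[i // chunks_per_proxy])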
def main(return_results=True, force_reload=False, proxies=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically.
                      Will return all scraped results.
    """
    # Avoid a shared mutable default argument; proxies is appended to below.
    proxies = proxies or []

    parse_cmd_args()

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right at the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    if not proxies:
        # Look for proxies in the MySQL database or a proxy file if none were given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()

    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper had been run without the --simulate flag, it would have')
        logger.info('scraped {} keywords (before caching), with {} results per page and {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_of_pages')))
        logger.info('used {} distinct proxies in total, with the following IP addresses: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()
        # A lock to prevent multiple threads from solving captchas at the same time.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining) // max_sel_browsers, fillvalue=None)
        else:
            # Fewer keywords than browsers: give each browser a single-keyword group.
            kwgroups = [[kw, ] for kw in remaining]

        # Distribute the proxies evenly over the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using your own IP is prohibited by the configuration. Shutting down.")

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()

        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw,
                           num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1),
                           scrapemethod='http')
                if r:
                    cursor.execute(
                        'INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                        (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result:
                                for title, snippet, url, pos in result[result_set]:
                                    cursor.execute(
                                        'INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(), url.netloc, pos, serp_id))
                results.append(r)
        cursor.close()

        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0))

        return conn
    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')
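# Minimal sketch (assumption): InvalidConfigurationException is raised throughout the
# functions above but not defined in this section. It is presumably a plain Exception
# subclass defined elsewhere in GoogleScraper; something along these lines would make
# the code above runnable on its own.
class InvalidConfigurationException(Exception):
    pass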
def main(return_results=True, force_reload=True, proxies=None):
    """Runs the GoogleScraper application as determined by the various configuration points.

    Keyword arguments:
    return_results -- Whether the GoogleScraper application is run programmatically.
                      Will return all scraped results.
    """
    # Avoid a shared mutable default argument; proxies is appended to below.
    proxies = proxies or []

    global Config
    Config = get_config(True, force_reload)

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        sys.exit(0)

    if Config['GLOBAL'].getboolean('do_caching'):
        d = Config['GLOBAL'].get('cachedir')
        if not os.path.exists(d):
            os.mkdir(d, 0o744)
        else:
            maybe_clean_cache()

    kwfile = Config['SCRAPING'].get('keyword_file')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = set(Config['SCRAPING'].get('keywords', '').split('\n'))
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if not (keyword or keywords) and not kwfile:
        raise InvalidConfigurationException('You must specify a keyword file (separated by newlines, each keyword on a line) with the flag `--keyword-file {filepath}`')

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        sys.exit('renaming done. restart for normal use.')

    keywords = [keyword, ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException('The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right at the beginning
            keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n')])

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException('No more than 100 results per page are available for Google searches.')

    if not proxies:
        # Look for proxies in the MySQL database or a proxy file if none were given as keyword argument
        if proxy_db:
            proxies = get_proxies_from_mysql_db(proxy_db)
        elif proxy_file:
            proxies = parse_proxy_file(proxy_file)

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException('Invalid search type! Select one of {}'.format(repr(valid_search_types)))

    # Create a sqlite database to store the results
    conn = maybe_create_db()

    if Config['GLOBAL'].getboolean('simulate'):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper had been run without the --simulate flag, it would have')
        logger.info('scraped {} keywords (before caching), with {} results per page and {} pages for each keyword'.format(
            len(keywords),
            Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_of_pages')))
        logger.info('used {} distinct proxies in total, with the following IP addresses: {}'.format(
            len(proxies), '\t\t\n'.join(proxies)))
        if Config['SCRAPING'].get('scrapemethod') == 'sel':
            mode = 'selenium mode with {} browser instances'.format(Config['SELENIUM'].getint('num_browser_instances'))
        else:
            mode = 'http mode'
        logger.info('using {}'.format(mode))
        sys.exit(0)

    # Let the games begin
    if Config['SCRAPING'].get('scrapemethod', '') == 'sel':
        # First of all, let's see how many keywords remain to scrape after parsing the cache
        if Config['GLOBAL'].getboolean('do_caching'):
            remaining = parse_all_cached_files(keywords, conn, simulate=Config['GLOBAL'].getboolean('simulate'))
        else:
            remaining = keywords

        # Create a lock to sync file access
        rlock = threading.RLock()
        # A lock to prevent multiple threads from solving captchas at the same time.
        lock = threading.Lock()

        max_sel_browsers = Config['SELENIUM'].getint('num_browser_instances')
        if len(remaining) > max_sel_browsers:
            kwgroups = grouper(remaining, len(remaining) // max_sel_browsers, fillvalue=None)
        else:
            # Fewer keywords than browsers: give each browser a single-keyword group.
            kwgroups = [[kw, ] for kw in remaining]

        # Distribute the proxies evenly over the keywords to search for
        scrapejobs = []
        Q = queue.Queue()

        if Config['SCRAPING'].getboolean('use_own_ip'):
            proxies.append(None)
        elif not proxies:
            raise InvalidConfigurationException("No proxies available and using your own IP is prohibited by the configuration. Shutting down.")

        chunks_per_proxy = math.ceil(len(kwgroups) / len(proxies))
        for i, chunk in enumerate(kwgroups):
            scrapejobs.append(SelScraper(chunk, rlock, Q, captcha_lock=lock, browser_num=i, proxy=proxies[i // chunks_per_proxy]))

        for t in scrapejobs:
            t.start()

        handler = ResultsHandler(Q, conn)
        handler.start()

        for t in scrapejobs:
            t.join()

        # All scrape jobs done, signal the db handler to stop
        Q.put(Config['GLOBAL'].get('all_processed_sig'))
        handler.join()

        conn.commit()

        if return_results:
            return conn
        else:
            conn.close()

    elif Config['SCRAPING'].get('scrapemethod') == 'http':
        results = []
        cursor = conn.cursor()

        if Config['SCRAPING'].getboolean('deep_scrape', False):
            # TODO: implement deep scrape
            raise NotImplementedError('Sorry. Currently deep scrape is not implemented.')
        else:
            for i, kw in enumerate(keywords):
                r = scrape(kw,
                           num_results_per_page=Config['SCRAPING'].getint('num_results_per_page', 10),
                           num_pages=Config['SCRAPING'].getint('num_pages', 1),
                           scrapemethod='http')
                if r:
                    cursor.execute(
                        'INSERT INTO serp_page (page_number, requested_at, num_results, num_results_for_kw_google, search_query) VALUES(?,?,?,?,?)',
                        (i, datetime.datetime.utcnow(), 0, 0, kw))
                    serp_id = cursor.lastrowid
                    for result in r:
                        for result_set in ('results', 'ads_main', 'ads_aside'):
                            if result_set in result:
                                for title, snippet, url, pos in result[result_set]:
                                    cursor.execute(
                                        'INSERT INTO link (title, snippet, url, domain, rank, serp_id) VALUES(?, ?, ?, ?, ?, ?)',
                                        (title, snippet, url.geturl(), url.netloc, pos, serp_id))
                results.append(r)
        cursor.close()

        if Config['GLOBAL'].get('print'):
            print_scrape_results_http(results, Config['GLOBAL'].getint('verbosity', 0))

        return conn
    else:
        raise InvalidConfigurationException('No such scrapemethod. Use "http" or "sel"')
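# Hypothetical programmatic usage of the main() variant above (not part of GoogleScraper
# itself). It assumes the configuration already points at a keyword file and that the
# 'http' scrapemethod is selected, so results end up in the 'link' table created by
# maybe_create_db(); the column names below match the INSERT statements above.
if __name__ == '__main__':
    conn = main(return_results=True)
    if conn is not None:
        cursor = conn.cursor()
        for title, url in cursor.execute('SELECT title, url FROM link LIMIT 10'):
            print(title, url)
        conn.close()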