def __init__(self,
                 config,
                 query='',
                 page_number=1,
                 search_engine='google',
                 scrape_method='http-async'):
        """
        """
        self.config = config
        self.query = query
        self.page_number = page_number
        self.search_engine_name = search_engine
        self.search_type = 'normal'
        self.scrape_method = scrape_method
        self.requested_at = None
        self.requested_by = 'localhost'
        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.base_search_url = get_base_search_url_by_search_engine(
            self.config, self.search_engine_name, 'http')
        self.params = get_GET_params_for_search_engine(
            self.query, self.search_engine_name, search_type=self.search_type)
        self.headers = headers
        self.status = 'successful'

        self.num_results_per_page = int(config['num_results_per_page'])
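        # 1-based offset of the first result on the requested page, e.g. with
        # 10 results per page and page_number=3 this yields 21 ("&first=21").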
        self.startRecord = self.num_results_per_page * (self.page_number -
                                                        1) + 1
        self.stringStartRecord = "&first=" + str(self.startRecord)
    def get_parser_for_file(self, se, file, **kwargs):
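        # Read a locally stored result page and parse it with the parser
        # class registered for the given search engine.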
        with open(file, 'r') as f:
            html = f.read()
            parser = get_parser_by_search_engine(se)
            parser = parser(html, **kwargs)

        return parser
    def get_parser_for_file(self, se, file, **kwargs):
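        # Variant of get_parser_for_file that resolves the path against an
        # externally defined `base` directory and also hands `config` to the parser.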
        file = os.path.join(base, file)
        with open(file, 'r') as f:
            html = f.read()
            parser = get_parser_by_search_engine(se)
            parser = parser(config, html, **kwargs)

        return parser
Example #6
    def build_search(self):
        """Build the headers and params for the search request for the search engine."""

        self.search_params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                              self.page_number, self.num_results_per_page,
                                                              self.search_type)

        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.parser = self.parser(config=self.config)
Example #7
    def build_search(self):
        """Build the headers and params for the search request for the search engine."""

        self.search_params = get_GET_params_for_search_engine(
            self.query, self.search_engine_name, self.config, self.page_number,
            self.num_results_per_page, self.search_type)

        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.parser = self.parser(config=self.config)
Example #8
 def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
     self.query = query
     self.page_number = page_number
     self.search_engine_name = search_engine
     self.search_type = 'normal'
     self.scrape_method = 'http-async'
     self.requested_at = None
     self.requested_by = ''
     self.parser = get_parser_by_search_engine(self.search_engine_name)
     self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, 'http')
     self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name, search_type=self.search_type)
     self.headers = headers
Example #9
 def __init__(self, config, query='', page_number=1, search_engine='google', scrape_method='http-async'):
     """
     """
     self.config = config
     self.query = query
     self.page_number = page_number
     self.search_engine_name = search_engine
     self.search_type = 'normal'
     self.scrape_method = scrape_method
     self.requested_at = None
     self.requested_by = 'localhost'
     self.parser = get_parser_by_search_engine(self.search_engine_name)
     self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, 'http')
     self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                    search_type=self.search_type)
     self.headers = headers
     self.status = 'successful'
Example #10
 def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
     """
     @todo: **kwargs doesn't seem to be used; check whether any caller passes extra keyword arguments and remove it if not.
     """
     self.query = query
     self.page_number = page_number
     self.search_engine_name = search_engine
     self.search_type = 'normal'
     self.scrape_method = 'http-async'
     self.requested_at = None
     self.requested_by = 'localhost'
     self.parser = get_parser_by_search_engine(self.search_engine_name)
     self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, 'http')
     self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                    search_type=self.search_type)
     self.headers = headers
     self.status = 'successful'
Example #12
    def __init__(self, keywords=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
                 start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
        """Instantiate an SearchEngineScrape object.

        Args:
            TODO
        """
        self.search_engine = search_engine
        assert self.search_engine, 'You need to specify a search_engine'

        self.search_engine = self.search_engine.lower()

        if not search_type:
            self.search_type = Config['SCRAPING'].get('search_type', 'normal')
        else:
            self.search_type = search_type
            
        # The number of pages to scrape for each keyword
        self.num_pages_per_keyword = Config['SCRAPING'].getint('num_pages_for_keyword', 1)
        
        # The keywords that need to be scraped
        # If a SearchEngineScrape receives keywords explicitly, scrape them;
        # otherwise scrape the ones specified in the Config.
        if keywords:
            self.keywords = keywords
        else:
            self.keywords = Config['SCRAPING'].get('keywords', [])

        self.keywords = list(set(self.keywords))

        # the number of keywords
        self.num_keywords = len(self.keywords)
        
        # The actual keyword that is to be scraped next
        self.current_keyword = self.keywords[0]

        # The number that shows how many searches have been done by the worker
        self.search_number = 1

        # The parser that should be used to parse the search engine results
        self.parser = get_parser_by_search_engine(self.search_engine)()
        
        # The number of results per page
        self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

        # The page where to start scraping. By default the starting page is 1.
        if start_page_pos:
            self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
        else:
            self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

        # The page where we are right now
        self.current_page = self.start_page_pos
        
        # Install the proxy if one was provided
        self.proxy = proxy
        if isinstance(proxy, Proxy):
            self.set_proxy()
            self.ip = self.proxy.host + ':' + self.proxy.port
        else:
            self.ip = 'localhost'

        # the scraper_search object
        self.scraper_search = scraper_search
        
        # the scrape mode
        # to be set by subclasses
        self.scrapemethod = ''

        # set the database lock
        self.db_lock = db_lock

        # init the cache lock
        self.cache_lock = cache_lock

        # a queue to put an element in whenever a new keyword is scraped.
        # to visualize the progress
        self.progress_queue = progress_queue

        # set the session
        self.session = session

        # the current request time
        self.current_request_time = None

        # How long to sleep (in seconds) after every n-th request
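        # Each line of the 'sleeping_ranges' option is expected to look like
        # "n: a, b" and is parsed into {n: (a, b)} below.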
        self.sleeping_ranges = dict()
        for line in Config['GLOBAL'].get('sleeping_ranges').split('\n'):
            assert line.count(':') == 1, 'Invalid sleep range format.'
            key, value = line.split(':')
            self.sleeping_ranges[int(key)] = tuple([int(offset.strip()) for offset in value.split(',')])

        # the output files. Either CSV or JSON
        # It's a little tricky to write the JSON output file, since we need to
        # create the outermost results array ourselves because we write
        # results as soon as we get them (it's impossible to hold the whole search in memory).
        self.output_format = Config['GLOBAL'].get('output_format', 'stdout')
        self.output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')
        if self.output_format == 'json':
            self.json_outfile = open(self.output_file + '.json', 'a')
            self.json_outfile.write('[')
        elif self.output_format == 'csv':
            self.csv_outfile = csv.DictWriter(open(self.output_file + '.csv', 'a'),
                    fieldnames=('link', 'title', 'snippet', 'visible_link', 'num_results',
                                'query', 'search_engine_name', 'requested_by',
                                'scrapemethod', 'page_number', 'requested_at'))
            self.csv_outfile.writeheader()
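
For reference, the sleeping_ranges option parsed above is expected to contain one entry per line in the form "n: a, b". A minimal standalone sketch of that parsing, with sample values chosen purely for illustration:

# Standalone sketch of the sleeping_ranges parsing shown above; the raw string
# is an illustrative sample, not a value taken from any real configuration.
raw = '5: 1, 3\n25: 30, 60'
sleeping_ranges = dict()
for line in raw.split('\n'):
    assert line.count(':') == 1, 'Invalid sleep range format.'
    key, value = line.split(':')
    sleeping_ranges[int(key)] = tuple(int(offset.strip()) for offset in value.split(','))
print(sleeping_ranges)  # {5: (1, 3), 25: (30, 60)}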
Example #13
    def __init__(self, jobs=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
                 start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
        """Instantiate an SearchEngineScrape object.

        Args:
            TODO
        """
        jobs = jobs or {}
        self.search_engine_name = search_engine
        assert self.search_engine_name, 'You need to specify a search_engine'

        self.search_engine_name = self.search_engine_name.lower()

        if not search_type:
            self.search_type = Config['SCRAPING'].get('search_type', 'normal')
        else:
            self.search_type = search_type

        self.jobs = jobs

        # the keywords that couldn't be scraped by this worker
        self.missed_keywords = set()

        # the number of keywords
        self.num_keywords = len(self.jobs)

        # The actual keyword that is to be scraped next
        self.query = ''

        # The default number of pages per keyword
        self.pages_per_keyword = [1, ]

        # The number that shows how many searches have been done by the worker
        self.search_number = 1

        # The parser that should be used to parse the search engine results
        self.parser = get_parser_by_search_engine(self.search_engine_name)()

        # The number of results per page
        self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

        # The page where to start scraping. By default the starting page is 1.
        if start_page_pos:
            self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
        else:
            self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

        # The page where we are right now
        self.page_number = self.start_page_pos

        # Install the proxy if one was provided
        self.proxy = proxy
        if isinstance(proxy, Proxy):
            self.set_proxy()
            self.requested_by = self.proxy.host + ':' + self.proxy.port
        else:
            self.requested_by = 'localhost'

        # the scraper_search object
        self.scraper_search = scraper_search

        # the scrape mode
        # to be set by subclasses
        self.scrape_method = ''

        # Whether the instance is ready to run
        self.startable = True

        # set the database lock
        self.db_lock = db_lock

        # init the cache lock
        self.cache_lock = cache_lock

        # a queue to put an element in whenever a new keyword is scraped.
        # to visualize the progress
        self.progress_queue = progress_queue

        # set the session
        self.session = session

        # the current request time
        self.requested_at = None

        # The name of the scraper
        self.name = '[{}]'.format(self.search_engine_name) + self.__class__.__name__

        # How long to sleep (in seconds) after every n-th request
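        # A '<search_engine>_sleeping_ranges' option, if present, overrides the
        # global 'sleeping_ranges' option for this engine.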
        self.sleeping_ranges = dict()
        sleep_ranges_option = Config['GLOBAL'].get(
            '{search_engine}_sleeping_ranges'.format(search_engine=self.search_engine_name),
            Config['GLOBAL'].get('sleeping_ranges'))

        for line in sleep_ranges_option.split('\n'):
            assert line.count(':') == 1, 'Invalid sleep range format.'
            key, value = line.split(':')
            self.sleeping_ranges[int(key)] = tuple([int(offset.strip()) for offset in value.split(',')])

        # the default timeout
        self.timeout = 5

        # the status of the thread after finishing or failing
        self.status = 'successful'

        self.html = ''
Example #15
    def __init__(self, keywords=None, session=None, scraper_search=None, db_lock=None, cache_lock=None,
                 start_page_pos=1, search_engine=None, search_type=None, proxy=None):
        """Instantiate an SearchEngineScrape object.

        Args:
            TODO
        """
        if not search_engine:
            self.search_engine = Config['SCRAPING'].get('search_engine', 'google')
        else:
            self.search_engine = search_engine

        self.search_engine = self.search_engine.lower()

        if not search_type:
            self.search_type = Config['SCRAPING'].get('search_type', 'normal')
        else:
            self.search_type = search_type
            
        # The number of pages to scrape for each keyword
        self.num_pages_per_keyword = Config['SCRAPING'].getint('num_pages_for_keyword', 1)
        
        # The proxy to use
        self.proxy = proxy
        
        # The keywords that need to be scraped
        # If a SearchEngineScrape receives keywords explicitly, scrape them;
        # otherwise scrape the ones specified in the Config.
        if keywords:
            self.keywords = keywords
        else:
            self.keywords = Config['SCRAPING'].get('keywords', [])

        self.keywords = list(set(self.keywords))
        
        # The actual keyword that is to be scraped next
        self.current_keyword = self.keywords[0]

        # The parser that should be used to parse the search engine results
        self.parser = get_parser_by_search_engine(self.search_engine)()
        
        # The number of results per page
        self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

        # The page where to start scraping. By default the starting page is 1.
        if start_page_pos:
            self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
        else:
            self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

        # The page where we are right now
        self.current_page = self.start_page_pos
        
        # Install the proxy if one was provided
        self.proxy = proxy
        if proxy:
            self.set_proxy()
            self.ip = self.proxy.host
        else:
            self.ip = '127.0.0.1'

        # set the database scoped session
        self.session = session

        # the scraper_search object
        self.scraper_search = scraper_search

        # get the base search url based on the search engine.
        self.base_search_url = Config['SCRAPING'].get('{search_engine}_search_url'.format(search_engine=self.search_engine))
        
        # the scrape mode
        # to be set by subclasses
        self.scrapemethod = ''

        # set the database lock
        self.db_lock = db_lock

        # init the cache lock
        self.cache_lock = cache_lock
    def __init__(self,
                 jobs=None,
                 scraper_search=None,
                 session=None,
                 db_lock=None,
                 cache_lock=None,
                 start_page_pos=1,
                 search_engine=None,
                 search_type=None,
                 proxy=None,
                 progress_queue=None):
        """Instantiate an SearchEngineScrape object.

        Args:
            TODO
        """
        # Avoid a shared mutable default argument for jobs.
        jobs = jobs or {}
        self.search_engine_name = search_engine
        assert self.search_engine_name, 'You need to specify a search_engine'

        self.search_engine_name = self.search_engine_name.lower()

        if not search_type:
            self.search_type = Config['SCRAPING'].get('search_type', 'normal')
        else:
            self.search_type = search_type

        self.jobs = jobs

        # the keywords that couldn't be scraped by this worker
        self.missed_keywords = set()

        # the number of keywords
        self.num_keywords = len(self.jobs)

        # The actual keyword that is to be scraped next
        self.query = ''

        # The default number of pages per keyword
        self.pages_per_keyword = [
            1,
        ]

        # The number that shows how many searches have been done by the worker
        self.search_number = 1

        # The parser that should be used to parse the search engine results
        self.parser = get_parser_by_search_engine(self.search_engine_name)()

        # The number of results per page
        self.num_results_per_page = Config['SCRAPING'].getint(
            'num_results_per_page', 10)

        # The page where to start scraping. By default the starting page is 1.
        if start_page_pos:
            self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
        else:
            self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

        # The page where we are right now
        self.page_number = self.start_page_pos

        # Install the proxy if one was provided
        self.proxy = proxy
        if isinstance(proxy, Proxy):
            self.set_proxy()
            self.requested_by = self.proxy.host + ':' + self.proxy.port
        else:
            self.requested_by = 'localhost'

        # the scraper_search object
        self.scraper_search = scraper_search

        # the scrape mode
        # to be set by subclasses
        self.scrape_method = ''

        # Whether the instance is ready to run
        self.startable = True

        # set the database lock
        self.db_lock = db_lock

        # init the cache lock
        self.cache_lock = cache_lock

        # a queue to put an element in whenever a new keyword is scraped.
        # to visualize the progress
        self.progress_queue = progress_queue

        # set the session
        self.session = session

        # the current request time
        self.requested_at = None

        # The name of the scraper
        self.name = '[{}]'.format(
            self.search_engine_name) + self.__class__.__name__

        # How long to sleep (in seconds) after every n-th request
        self.sleeping_ranges = dict()
        sleep_ranges_option = Config['GLOBAL'].get(
            '{search_engine}_sleeping_ranges'.format(
                search_engine=self.search_engine_name),
            Config['GLOBAL'].get('sleeping_ranges'))

        for line in sleep_ranges_option.split('\n'):
            assert line.count(':') == 1, 'Invalid sleep range format.'
            key, value = line.split(':')
            self.sleeping_ranges[int(key)] = tuple(
                [int(offset.strip()) for offset in value.split(',')])

        # the default timeout
        self.timeout = 5
Example #18
    def __init__(self,
                 config,
                 cache_manager=None,
                 jobs=None,
                 scraper_search=None,
                 session=None,
                 db_lock=None,
                 cache_lock=None,
                 start_page_pos=1,
                 search_engine=None,
                 search_type=None,
                 proxy=None,
                 progress_queue=None):
        """Instantiate an SearchEngineScrape object.

        Args:
            TODO
        """
        # Set the config dictionary
        self.config = config

        # Set the cache manager
        self.cache_manager = cache_manager

        jobs = jobs or {}
        self.search_engine_name = search_engine
        assert self.search_engine_name, 'You need to specify a search_engine'

        self.search_engine_name = self.search_engine_name.lower()

        if not search_type:
            self.search_type = self.config.get('search_type', 'normal')
        else:
            self.search_type = search_type

        if self.search_engine_name == 'google' and self.search_type == 'image':
            self.search_engine_name = 'googleimg'

        self.jobs = jobs

        # the keywords that couldn't be scraped by this worker
        self.missed_keywords = set()

        # the number of keywords
        self.num_keywords = len(self.jobs)

        # The actual keyword that is to be scraped next
        self.query = ''

        # The default number of pages per keyword
        self.pages_per_keyword = [
            1,
        ]

        # The number that shows how many searches have been done by the worker
        self.search_number = 1

        # The parser that should be used to parse the search engine results
        self.parser = get_parser_by_search_engine(
            self.search_engine_name)(config=self.config)

        # The number of results per page
        self.num_results_per_page = int(
            self.config.get('num_results_per_page', 10))

        # The page where to start scraping. By default the starting page is 1.
        if start_page_pos:
            self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
        else:
            self.start_page_pos = int(self.config.get('search_offset', 1))

        # The page where we are right now
        self.page_number = self.start_page_pos

        # Install the proxy if one was provided
        self.proxy = proxy
        if isinstance(proxy, Proxy):
            self.set_proxy()
            self.requested_by = self.proxy.host + ':' + self.proxy.port
        else:
            self.requested_by = 'localhost'

        # the scraper_search object
        self.scraper_search = scraper_search

        # the scrape mode
        # to be set by subclasses
        self.scrape_method = ''

        # Whether the instance is ready to run
        self.startable = True

        # set the database lock
        self.db_lock = db_lock

        # init the cache lock
        self.cache_lock = cache_lock

        # a queue to put an element in whenever a new keyword is scraped.
        # to visualize the progress
        self.progress_queue = progress_queue

        # set the session
        self.session = session

        # the current request time
        self.requested_at = None

        # The name of the scraper
        self.name = '[{}]'.format(
            self.search_engine_name) + self.__class__.__name__

        # How long to sleep (in seconds) after every n-th request
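        # Unlike the variants above that parse a text block, this one takes the
        # option from the config dict as-is.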
        self.sleeping_ranges = dict()
        self.sleeping_ranges = self.config.get(
            '{search_engine}_sleeping_ranges'.format(
                search_engine=self.search_engine_name),
            self.config.get('sleeping_ranges'))

        # the default timeout
        self.timeout = 5

        # the status of the thread after finishing or failing
        self.status = 'successful'

        self.html = ''
Example #20
    def __init__(self, config, cache_manager=None, jobs=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
                 start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
        """Instantiate an SearchEngineScrape object.

        Args:
            TODO
        """
        # Set the config dictionary
        self.config = config

        # Set the cache manager
        self.cache_manager = cache_manager

        jobs = jobs or {}
        self.search_engine_name = search_engine
        assert self.search_engine_name, 'You need to specify a search_engine'

        self.search_engine_name = self.search_engine_name.lower()

        if not search_type:
            self.search_type = self.config.get('search_type', 'normal')
        else:
            self.search_type = search_type

        self.jobs = jobs

        # the keywords that couldn't be scraped by this worker
        self.missed_keywords = set()

        # the number of queries to scrape
        self.num_keywords = len(self.jobs)

        # The actual keyword that is to be scraped next
        self.query = ''

        # The default number of pages per keyword
        self.pages_per_keyword = [1, ]

        # The number that shows how many searches have been done by the worker
        self.search_number = 1

        # The parser that should be used to parse the search engine results
        self.parser = get_parser_by_search_engine(self.search_engine_name)(config=self.config)

        # The number of results per page
        self.num_results_per_page = int(self.config.get('num_results_per_page', 10))

        # The page where to start scraping. By default the starting page is 1.
        if start_page_pos:
            self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
        else:
            self.start_page_pos = int(self.config.get('search_offset', 1))

        # The page where we are right now
        self.page_number = self.start_page_pos

        # Install the proxy if one was provided
        self.proxy = proxy
        if isinstance(proxy, Proxy):
            self.set_proxy()
            self.requested_by = self.proxy.host + ':' + self.proxy.port
        else:
            self.requested_by = 'localhost'

        # the scraper_search object
        self.scraper_search = scraper_search

        # the scrape mode
        # to be set by subclasses
        self.scrape_method = ''

        # Whether the instance is ready to run
        self.startable = True

        # set the database lock
        self.db_lock = db_lock

        # init the cache lock
        self.cache_lock = cache_lock

        # a queue to put an element in whenever a new keyword is scraped.
        # to visualize the progress
        self.progress_queue = progress_queue

        # set the session
        self.session = session

        # the current request time
        self.requested_at = None

        # The name of the scraper
        self.scraper_name = '{}-{}'.format(self.__class__.__name__, self.search_engine_name)

        # How long to sleep (in seconds) after every n-th request
        self.sleeping_ranges = dict()
        self.sleeping_ranges = self.config.get(
            '{search_engine}_sleeping_ranges'.format(search_engine=self.search_engine_name),
            self.config.get('sleeping_ranges'))
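        # The keys of sleeping_ranges apparently act as percentage shares used to
        # distribute sleep intervals across the keyword jobs, hence the check below.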

        assert sum(self.sleeping_ranges.keys()) == 100, 'The sum of the keys of sleeping_ranges must be 100!'

        # compute sleeping ranges
        self.sleeping_times = self._create_random_sleeping_intervals(self.num_keywords)
        logger.debug('Sleeping ranges: {}'.format(self.sleeping_times))

        # the default timeout
        self.timeout = 5

        # the status of the thread after finishing or failing
        self.status = 'successful'

        self.html = ''
Example #21
    def __init__(self,
                 keywords=None,
                 session=None,
                 scraper_search=None,
                 start_page_pos=1,
                 search_engine=None,
                 search_type=None,
                 proxy=None):
        if not search_engine:
            self.search_engine = Config['SCRAPING'].get(
                'search_engine', 'Google')
        else:
            self.search_engine = search_engine

        self.search_engine = self.search_engine.lower()

        if not search_type:
            self.search_type = Config['SCRAPING'].get('search_type', 'normal')
        else:
            self.search_type = search_type

        # The number of pages to scrape for each keyword
        self.num_pages_per_keyword = Config['SCRAPING'].getint(
            'num_pages_for_keyword', 1)

        # The proxy to use
        self.proxy = proxy

        # The keywords that need to be scraped
        # If a SearchEngineScrape receives keywords explicitly, scrape them;
        # otherwise scrape the ones specified in the Config.
        if keywords:
            self.keywords = keywords
        else:
            self.keywords = Config['SCRAPING'].get('keywords', [])

        # A single keyword string becomes a one-element set; a list of keywords
        # is deduplicated by converting it to a set.
        if isinstance(self.keywords, (list, set, tuple)):
            self.keywords = set(self.keywords)
        else:
            self.keywords = {self.keywords}

        # The actual keyword that is to be scraped next
        self.current_keyword = self.next_keyword()

        # The parser that should be used to parse the search engine results
        self.parser = get_parser_by_search_engine(self.search_engine)()

        # The number of results per page
        self.num_results_per_page = Config['SCRAPING'].getint(
            'num_results_per_page', 10)

        # The page where to start scraping. By default the starting page is 1.
        if start_page_pos:
            self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
        else:
            self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

        # The page where we are right now
        self.current_page = self.start_page_pos

        # Install the proxy if one was provided
        self.proxy = proxy
        if proxy:
            self.set_proxy()

        # set the database scoped session
        self.session = session

        # the scraper_search database object
        self.scraper_search = scraper_search

        # get the base search url based on the search engine.
        self.base_search_url = Config['SCRAPING'].get(
            '{search_engine}_search_url'.format(
                search_engine=self.search_engine))