def __init__(self, config, query='', page_number=1, search_engine='google', scrape_method='http-async'):
    """Set up the request state for a single search engine query."""
    self.config = config
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = scrape_method
    self.requested_at = None
    self.requested_by = 'localhost'
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(
        self.query, self.search_engine_name, search_type=self.search_type)
    self.headers = headers
    self.status = 'successful'
    self.num_results_per_page = int(config['num_results_per_page'])
    self.startRecord = self.num_results_per_page * (self.page_number - 1) + 1
    self.stringStartRecord = "&first=" + str(self.startRecord)
def get_parser_for_file(self, se, file, **kwargs):
    with open(file, 'r') as f:
        html = f.read()
    parser = get_parser_by_search_engine(se)
    parser = parser(html, **kwargs)
    return parser
def get_parser_for_file(self, se, file, **kwargs):
    file = os.path.join(base, file)
    with open(file, 'r') as f:
        html = f.read()
    parser = get_parser_by_search_engine(se)
    parser = parser(config, html, **kwargs)
    return parser
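# Hedged usage sketch (an assumption, not part of the original tests): shows how the
# helper above might be driven from a test case. The fixture path is illustrative
# only; keyword arguments are forwarded unchanged to the parser constructor.
def example_load_fixture(testcase):
    parser = testcase.get_parser_for_file('google', 'data/google_serp.html')
    return parser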
def build_search(self):
    """Build the headers and params for the search request for the search engine."""
    self.search_params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                          self.page_number, self.num_results_per_page,
                                                          self.search_type)
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.parser = self.parser(config=self.config)
def build_search(self):
    """Build the headers and params for the search request for the search engine."""
    self.search_params = get_GET_params_for_search_engine(
        self.query, self.search_engine_name, self.config,
        self.page_number, self.num_results_per_page, self.search_type)
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.parser = self.parser(config=self.config)
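# Hedged usage sketch (an assumption, not code from the original project): shows how one
# of the request objects above might be driven end to end. The `requests` call is a
# stand-in for whatever HTTP layer the project actually uses, and parser.parse() is
# assumed to be the parser's entry point.
def example_run_search(scraper):
    import requests  # third-party HTTP client, used here for illustration only

    scraper.build_search()
    response = requests.get(scraper.base_search_url, params=scraper.search_params,
                            headers=scraper.headers, timeout=10)
    # Hand the raw HTML to the parser that build_search() instantiated.
    scraper.parser.parse(response.text)
    return scraper.parser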
def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = 'http-async'
    self.requested_at = None
    self.requested_by = ''
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                   search_type=self.search_type)
    self.headers = headers
def __init__(self, config, query='', page_number=1, search_engine='google', scrape_method='http-async'):
    """Set up the request state for a single search engine query."""
    self.config = config
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = scrape_method
    self.requested_at = None
    self.requested_by = 'localhost'
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                   search_type=self.search_type)
    self.headers = headers
    self.status = 'successful'
def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
    """
    @todo: **kwargs doesn't seem to be used, check if any call to init
           passes additional keyword args and remove it
    """
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = 'http-async'
    self.requested_at = None
    self.requested_by = 'localhost'
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                   search_type=self.search_type)
    self.headers = headers
    self.status = 'successful'
def __init__(self, keywords=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
             start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
    """Instantiate a SearchEngineScrape object.

    Args:
        TODO
    """
    self.search_engine = search_engine
    assert self.search_engine, 'You need to specify a search_engine'

    self.search_engine = self.search_engine.lower()

    if not search_type:
        self.search_type = Config['SCRAPING'].get('search_type', 'normal')
    else:
        self.search_type = search_type

    # The number of pages to scrape for each keyword
    self.num_pages_per_keyword = Config['SCRAPING'].getint('num_pages_for_keyword', 1)

    # The keywords that need to be scraped.
    # If a SearchEngineScrape receives keywords explicitly, scrape them;
    # otherwise scrape the ones specified in the Config.
    if keywords:
        self.keywords = keywords
    else:
        self.keywords = Config['SCRAPING'].get('keywords', [])

    self.keywords = list(set(self.keywords))

    # the number of keywords
    self.num_keywords = len(self.keywords)

    # The actual keyword that is to be scraped next
    self.current_keyword = self.keywords[0]

    # The number that shows how many searches have been done by the worker
    self.search_number = 1

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine)()

    # The number of results per page
    self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

    # The page where we are right now
    self.current_page = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if isinstance(proxy, Proxy):
        self.set_proxy()
        self.ip = self.proxy.host + ':' + self.proxy.port
    else:
        self.ip = 'localhost'

    # the scraper_search object
    self.scraper_search = scraper_search

    # the scrape mode, to be set by subclasses
    self.scrapemethod = ''

    # set the database lock
    self.db_lock = db_lock

    # init the cache lock
    self.cache_lock = cache_lock

    # a queue to put an element in whenever a new keyword is scraped,
    # to visualize the progress
    self.progress_queue = progress_queue

    # set the session
    self.session = session

    # the current request time
    self.current_request_time = None

    # How long to sleep (in seconds) after every n-th request
    self.sleeping_ranges = dict()
    for line in Config['GLOBAL'].get('sleeping_ranges').split('\n'):
        assert line.count(':') == 1, 'Invalid sleep range format.'
        key, value = line.split(':')
        self.sleeping_ranges[int(key)] = tuple([int(offset.strip()) for offset in value.split(',')])

    # The output files, either CSV or JSON.
    # Writing the JSON output file is a little tricky, since we need to create the
    # outermost array of results ourselves: results are written as soon as we get
    # them (it's impossible to hold the whole search in memory).
    self.output_format = Config['GLOBAL'].get('output_format', 'stdout')
    self.output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

    if self.output_format == 'json':
        self.json_outfile = open(self.output_file + '.json', 'a')
        self.json_outfile.write('[')
    elif self.output_format == 'csv':
        self.csv_outfile = csv.DictWriter(open(self.output_file + '.csv', 'a'),
                                          fieldnames=('link', 'title', 'snippet', 'visible_link',
                                                      'num_results', 'query', 'search_engine_name',
                                                      'requested_by', 'scrapemethod', 'page_number',
                                                      'requested_at'))
        self.csv_outfile.writeheader()
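    # Hedged sketch (an assumption, not from the original code): only the opening '[' of
    # the JSON array is written here, so a matching close has to happen elsewhere once
    # scraping finishes, presumably along these lines in a teardown step:
    #
    #   self.json_outfile.write(json.dumps(results) + ',')   # per scraped page
    #   ...
    #   self.json_outfile.write(']')
    #   self.json_outfile.close()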
def __init__(self, jobs=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
             start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
    """Instantiate a SearchEngineScrape object.

    Args:
        TODO
    """
    jobs = jobs or {}
    self.search_engine_name = search_engine
    assert self.search_engine_name, 'You need to specify a search_engine'

    self.search_engine_name = self.search_engine_name.lower()

    if not search_type:
        self.search_type = Config['SCRAPING'].get('search_type', 'normal')
    else:
        self.search_type = search_type

    self.jobs = jobs

    # the keywords that couldn't be scraped by this worker
    self.missed_keywords = set()

    # the number of keywords
    self.num_keywords = len(self.jobs)

    # The actual keyword that is to be scraped next
    self.query = ''

    # The default pages per keyword
    self.pages_per_keyword = [1, ]

    # The number that shows how many searches have been done by the worker
    self.search_number = 1

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine_name)()

    # The number of results per page
    self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

    # The page where we are right now
    self.page_number = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if isinstance(proxy, Proxy):
        self.set_proxy()
        self.requested_by = self.proxy.host + ':' + self.proxy.port
    else:
        self.requested_by = 'localhost'

    # the scraper_search object
    self.scraper_search = scraper_search

    # the scrape mode, to be set by subclasses
    self.scrape_method = ''

    # Whether the instance is ready to run
    self.startable = True

    # set the database lock
    self.db_lock = db_lock

    # init the cache lock
    self.cache_lock = cache_lock

    # a queue to put an element in whenever a new keyword is scraped,
    # to visualize the progress
    self.progress_queue = progress_queue

    # set the session
    self.session = session

    # the current request time
    self.requested_at = None

    # The name of the scraper
    self.name = '[{}]'.format(self.search_engine_name) + self.__class__.__name__

    # How long to sleep (in seconds) after every n-th request
    self.sleeping_ranges = dict()
    sleep_ranges_option = Config['GLOBAL'].get(
        '{search_engine}_sleeping_ranges'.format(search_engine=self.search_engine_name),
        Config['GLOBAL'].get('sleeping_ranges'))

    for line in sleep_ranges_option.split('\n'):
        assert line.count(':') == 1, 'Invalid sleep range format.'
        key, value = line.split(':')
        self.sleeping_ranges[int(key)] = tuple([int(offset.strip()) for offset in value.split(',')])

    # the default timeout
    self.timeout = 5

    # the status of the thread after finishing or failing
    self.status = 'successful'

    self.html = ''
def __init__(self, keywords=None, session=None, scraper_search=None, db_lock=None, cache_lock=None,
             start_page_pos=1, search_engine=None, search_type=None, proxy=None):
    """Instantiate a SearchEngineScrape object.

    Args:
        TODO
    """
    if not search_engine:
        self.search_engine = Config['SCRAPING'].get('search_engine', 'google')
    else:
        self.search_engine = search_engine

    self.search_engine = self.search_engine.lower()

    if not search_type:
        self.search_type = Config['SCRAPING'].get('search_type', 'normal')
    else:
        self.search_type = search_type

    # The number of pages to scrape for each keyword
    self.num_pages_per_keyword = Config['SCRAPING'].getint('num_pages_for_keyword', 1)

    # The proxy to use
    self.proxy = proxy

    # The keywords that need to be scraped.
    # If a SearchEngineScrape receives keywords explicitly, scrape them;
    # otherwise scrape the ones specified in the Config.
    if keywords:
        self.keywords = keywords
    else:
        self.keywords = Config['SCRAPING'].get('keywords', [])

    self.keywords = list(set(self.keywords))

    # The actual keyword that is to be scraped next
    self.current_keyword = self.keywords[0]

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine)()

    # The number of results per page
    self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

    # The page where we are right now
    self.current_page = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if proxy:
        self.set_proxy()
        self.ip = self.proxy.host
    else:
        self.ip = '127.0.0.1'

    # set the database scoped session
    self.session = session

    # the scraper_search object
    self.scraper_search = scraper_search

    # get the base search url based on the search engine.
    self.base_search_url = Config['SCRAPING'].get(
        '{search_engine}_search_url'.format(search_engine=self.search_engine))

    # the scrape mode, to be set by subclasses
    self.scrapemethod = ''

    # set the database lock
    self.db_lock = db_lock

    # init the cache lock
    self.cache_lock = cache_lock
def __init__(self, jobs=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
             start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
    """Instantiate a SearchEngineScrape object.

    Args:
        TODO
    """
    # Avoid a mutable default argument for jobs.
    jobs = jobs if jobs is not None else {}

    self.search_engine_name = search_engine
    assert self.search_engine_name, 'You need to specify a search_engine'

    self.search_engine_name = self.search_engine_name.lower()

    if not search_type:
        self.search_type = Config['SCRAPING'].get('search_type', 'normal')
    else:
        self.search_type = search_type

    self.jobs = jobs

    # the keywords that couldn't be scraped by this worker
    self.missed_keywords = set()

    # the number of keywords
    self.num_keywords = len(self.jobs)

    # The actual keyword that is to be scraped next
    self.query = ''

    # The default pages per keyword
    self.pages_per_keyword = [1, ]

    # The number that shows how many searches have been done by the worker
    self.search_number = 1

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine_name)()

    # The number of results per page
    self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

    # The page where we are right now
    self.page_number = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if isinstance(proxy, Proxy):
        self.set_proxy()
        self.requested_by = self.proxy.host + ':' + self.proxy.port
    else:
        self.requested_by = 'localhost'

    # the scraper_search object
    self.scraper_search = scraper_search

    # the scrape mode, to be set by subclasses
    self.scrape_method = ''

    # Whether the instance is ready to run
    self.startable = True

    # set the database lock
    self.db_lock = db_lock

    # init the cache lock
    self.cache_lock = cache_lock

    # a queue to put an element in whenever a new keyword is scraped,
    # to visualize the progress
    self.progress_queue = progress_queue

    # set the session
    self.session = session

    # the current request time
    self.requested_at = None

    # The name of the scraper
    self.name = '[{}]'.format(self.search_engine_name) + self.__class__.__name__

    # How long to sleep (in seconds) after every n-th request
    self.sleeping_ranges = dict()
    sleep_ranges_option = Config['GLOBAL'].get(
        '{search_engine}_sleeping_ranges'.format(search_engine=self.search_engine_name),
        Config['GLOBAL'].get('sleeping_ranges'))

    for line in sleep_ranges_option.split('\n'):
        assert line.count(':') == 1, 'Invalid sleep range format.'
        key, value = line.split(':')
        self.sleeping_ranges[int(key)] = tuple([int(offset.strip()) for offset in value.split(',')])

    # the default timeout
    self.timeout = 5
def __init__(self, keywords=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
             start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
    """Instantiate a SearchEngineScrape object.

    Args:
        TODO
    """
    self.search_engine = search_engine
    assert self.search_engine, 'You need to specify a search_engine'

    self.search_engine = self.search_engine.lower()

    if not search_type:
        self.search_type = Config['SCRAPING'].get('search_type', 'normal')
    else:
        self.search_type = search_type

    # The number of pages to scrape for each keyword
    self.num_pages_per_keyword = Config['SCRAPING'].getint('num_pages_for_keyword', 1)

    # The keywords that need to be scraped.
    # If a SearchEngineScrape receives keywords explicitly, scrape them;
    # otherwise scrape the ones specified in the Config.
    if keywords:
        self.keywords = keywords
    else:
        self.keywords = Config['SCRAPING'].get('keywords', [])

    self.keywords = list(set(self.keywords))

    # the number of keywords
    self.num_keywords = len(self.keywords)

    # The actual keyword that is to be scraped next
    self.current_keyword = self.keywords[0]

    # The number that shows how many searches have been done by the worker
    self.search_number = 1

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine)()

    # The number of results per page
    self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

    # The page where we are right now
    self.current_page = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if isinstance(proxy, Proxy):
        self.set_proxy()
        self.ip = self.proxy.host + ':' + self.proxy.port
    else:
        self.ip = 'localhost'

    # the scraper_search object
    self.scraper_search = scraper_search

    # the scrape mode, to be set by subclasses
    self.scrapemethod = ''

    # set the database lock
    self.db_lock = db_lock

    # init the cache lock
    self.cache_lock = cache_lock

    # a queue to put an element in whenever a new keyword is scraped,
    # to visualize the progress
    self.progress_queue = progress_queue

    # set the session
    self.session = session

    # the current request time
    self.current_request_time = None

    # How long to sleep (in seconds) after every n-th request
    self.sleeping_ranges = dict()
    for line in Config['GLOBAL'].get('sleeping_ranges').split('\n'):
        assert line.count(':') == 1, 'Invalid sleep range format.'
        key, value = line.split(':')
        self.sleeping_ranges[int(key)] = tuple([int(offset.strip()) for offset in value.split(',')])
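# Hedged illustration (not part of the original module): a standalone mirror of the
# sleeping_ranges parsing used in the constructors above. The function name and the
# sample values are made up for demonstration.
def parse_sleeping_ranges(raw):
    """Parse "n: min, max" lines into {n: (min, max)}."""
    sleeping_ranges = dict()
    for line in raw.split('\n'):
        assert line.count(':') == 1, 'Invalid sleep range format.'
        key, value = line.split(':')
        sleeping_ranges[int(key)] = tuple(int(offset.strip()) for offset in value.split(','))
    return sleeping_ranges

# For example, a config value of "5: 1, 2\n30: 10, 20" parses to {5: (1, 2), 30: (10, 20)},
# presumably meaning: after every 5th request sleep 1 to 2 seconds, and after every
# 30th request sleep 10 to 20 seconds.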
def __init__(self, config, cache_manager=None, jobs=None, scraper_search=None, session=None,
             db_lock=None, cache_lock=None, start_page_pos=1, search_engine=None, search_type=None,
             proxy=None, progress_queue=None):
    """Instantiate a SearchEngineScrape object.

    Args:
        TODO
    """
    # Set the config dictionary
    self.config = config

    # Set the cache manager
    self.cache_manager = cache_manager

    jobs = jobs or {}
    self.search_engine_name = search_engine
    assert self.search_engine_name, 'You need to specify a search_engine'

    self.search_engine_name = self.search_engine_name.lower()

    if not search_type:
        self.search_type = self.config.get('search_type', 'normal')
    else:
        self.search_type = search_type

    if self.search_engine_name == 'google' and self.search_type == 'image':
        self.search_engine_name = 'googleimg'

    self.jobs = jobs

    # the keywords that couldn't be scraped by this worker
    self.missed_keywords = set()

    # the number of keywords
    self.num_keywords = len(self.jobs)

    # The actual keyword that is to be scraped next
    self.query = ''

    # The default pages per keyword
    self.pages_per_keyword = [1, ]

    # The number that shows how many searches have been done by the worker
    self.search_number = 1

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine_name)(config=self.config)

    # The number of results per page
    self.num_results_per_page = int(self.config.get('num_results_per_page', 10))

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = int(self.config.get('search_offset', 1))

    # The page where we are right now
    self.page_number = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if isinstance(proxy, Proxy):
        self.set_proxy()
        self.requested_by = self.proxy.host + ':' + self.proxy.port
    else:
        self.requested_by = 'localhost'

    # the scraper_search object
    self.scraper_search = scraper_search

    # the scrape mode, to be set by subclasses
    self.scrape_method = ''

    # Whether the instance is ready to run
    self.startable = True

    # set the database lock
    self.db_lock = db_lock

    # init the cache lock
    self.cache_lock = cache_lock

    # a queue to put an element in whenever a new keyword is scraped,
    # to visualize the progress
    self.progress_queue = progress_queue

    # set the session
    self.session = session

    # the current request time
    self.requested_at = None

    # The name of the scraper
    self.name = '[{}]'.format(self.search_engine_name) + self.__class__.__name__

    # How long to sleep (in seconds) after every n-th request
    self.sleeping_ranges = self.config.get(
        '{search_engine}_sleeping_ranges'.format(search_engine=self.search_engine_name),
        self.config.get('sleeping_ranges'))

    # the default timeout
    self.timeout = 5

    # the status of the thread after finishing or failing
    self.status = 'successful'

    self.html = ''
def __init__(self, config, cache_manager=None, jobs=None, scraper_search=None, session=None,
             db_lock=None, cache_lock=None, start_page_pos=1, search_engine=None, search_type=None,
             proxy=None, progress_queue=None):
    """Instantiate a SearchEngineScrape object.

    Args:
        TODO
    """
    # Set the config dictionary
    self.config = config

    # Set the cache manager
    self.cache_manager = cache_manager

    jobs = jobs or {}
    self.search_engine_name = search_engine
    assert self.search_engine_name, 'You need to specify a search_engine'

    self.search_engine_name = self.search_engine_name.lower()

    if not search_type:
        self.search_type = self.config.get('search_type', 'normal')
    else:
        self.search_type = search_type

    self.jobs = jobs

    # the keywords that couldn't be scraped by this worker
    self.missed_keywords = set()

    # the number of queries to scrape
    self.num_keywords = len(self.jobs)

    # The actual keyword that is to be scraped next
    self.query = ''

    # The default pages per keyword
    self.pages_per_keyword = [1, ]

    # The number that shows how many searches have been done by the worker
    self.search_number = 1

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine_name)(config=self.config)

    # The number of results per page
    self.num_results_per_page = int(self.config.get('num_results_per_page', 10))

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = int(self.config.get('search_offset', 1))

    # The page where we are right now
    self.page_number = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if isinstance(proxy, Proxy):
        self.set_proxy()
        self.requested_by = self.proxy.host + ':' + self.proxy.port
    else:
        self.requested_by = 'localhost'

    # the scraper_search object
    self.scraper_search = scraper_search

    # the scrape mode, to be set by subclasses
    self.scrape_method = ''

    # Whether the instance is ready to run
    self.startable = True

    # set the database lock
    self.db_lock = db_lock

    # init the cache lock
    self.cache_lock = cache_lock

    # a queue to put an element in whenever a new keyword is scraped,
    # to visualize the progress
    self.progress_queue = progress_queue

    # set the session
    self.session = session

    # the current request time
    self.requested_at = None

    # The name of the scraper
    self.scraper_name = '{}-{}'.format(self.__class__.__name__, self.search_engine_name)

    # How long to sleep (in seconds) after every n-th request
    self.sleeping_ranges = self.config.get(
        '{search_engine}_sleeping_ranges'.format(search_engine=self.search_engine_name),
        self.config.get('sleeping_ranges'))

    assert sum(self.sleeping_ranges.keys()) == 100, 'The sum of the keys of sleeping_ranges must be 100!'

    # compute sleeping ranges
    self.sleeping_times = self._create_random_sleeping_intervals(self.num_keywords)

    logger.debug('Sleeping ranges: {}'.format(self.sleeping_times))

    # the default timeout
    self.timeout = 5

    # the status of the thread after finishing or failing
    self.status = 'successful'

    self.html = ''
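# Hedged illustration (made-up numbers, not from the original config): given the
# assertion above, the configured sleeping_ranges here is presumably already a dict
# whose keys are percentages summing to 100, each mapped to a (min, max) sleep
# interval in seconds, e.g.
#
#   sleeping_ranges = {95: (1, 8), 5: (30, 90)}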
def __init__(self, keywords=None, session=None, scraper_search=None, start_page_pos=1,
             search_engine=None, search_type=None, proxy=None):
    if not search_engine:
        self.search_engine = Config['SCRAPING'].get('search_engine', 'Google')
    else:
        self.search_engine = search_engine

    self.search_engine = self.search_engine.lower()

    if not search_type:
        self.search_type = Config['SCRAPING'].get('search_type', 'normal')
    else:
        self.search_type = search_type

    # The number of pages to scrape for each keyword
    self.num_pages_per_keyword = Config['SCRAPING'].getint('num_pages_for_keyword', 1)

    # The proxy to use
    self.proxy = proxy

    # The keywords that need to be scraped.
    # If a SearchEngineScrape receives keywords explicitly, scrape them;
    # otherwise scrape the ones specified in the Config.
    if keywords:
        self.keywords = set(keywords)
    else:
        self.keywords = set(Config['SCRAPING'].get('keywords', []))

    if not isinstance(keywords, list):
        self.keywords = set([self.keywords])

    # The actual keyword that is to be scraped next
    self.current_keyword = self.next_keyword()

    # The parser that should be used to parse the search engine results
    self.parser = get_parser_by_search_engine(self.search_engine)()

    # The number of results per page
    self.num_results_per_page = Config['SCRAPING'].getint('num_results_per_page', 10)

    # The page where to start scraping. By default the starting page is 1.
    if start_page_pos:
        self.start_page_pos = 1 if start_page_pos < 1 else start_page_pos
    else:
        self.start_page_pos = Config['SCRAPING'].getint('search_offset', 1)

    # The page where we are right now
    self.current_page = self.start_page_pos

    # Install the proxy if one was provided
    self.proxy = proxy
    if proxy:
        self.set_proxy()

    # set the database scoped session
    self.session = session

    # the scraper_search database object
    self.scraper_search = scraper_search

    # get the base search url based on the search engine.
    self.base_search_url = Config['SCRAPING'].get(
        '{search_engine}_search_url'.format(search_engine=self.search_engine))