def start_bootstrapping(self):
    """
    Main method of the bootstrapping phase.

    Parses the command line, configures logging and MongoDB, loads the
    seed terms and the optional proxies, then crawls every top level
    category and every search word exposed by the seed.

    Exits the process with errno.ECONNREFUSED when MongoDB cannot be
    configured.
    """
    # Command line arguments, kept as a plain dict
    parser = self.get_arguments_parser()
    self._args = vars(parser.parse_args())

    # Logging
    self._logger = Utils.configure_log(self._args)

    # MongoDB - there is no point in crawling if it is unavailable
    mongo_ready = Utils.configure_mongodb(self, **self._params)
    if not mongo_ready:
        self._logger.fatal('Error configuring MongoDB')
        sys.exit(errno.ECONNREFUSED)

    # "Seed" terms read from the input received on the command line
    seed = BootstrappingSeed.Seed(self._args['bootstrapping-terms'])
    seed.initialize_seed_class()

    # Proxies are optional
    if not self._args['proxies_path']:
        self._proxies = None
    else:
        self._proxies = Utils.load_proxies(self._args)
        print('Loaded Proxies : %d' % len(self._proxies))

    # Certificates are not verified when "debug_https" is set
    self._verify_certificate = not self._args['debug_https']

    # One crawl per top level category
    for category in seed._top_level_categories:
        self.crawl_category(category)

    # One simulated search per word
    for search_word in seed.get_words():
        self.crawl_by_search_word(search_word)
def start_bootstrapping(self):
    """
    Entry point of the bootstrapping phase.

    Sets up arguments, logging, MongoDB, seed terms and (optionally)
    proxies, then issues one crawl per top level category followed by
    one simulated search per seed word.

    Exits the process with errno.ECONNREFUSED when MongoDB cannot be
    configured.
    """
    # Arguments, stored as a dict
    cli_parser = self.get_arguments_parser()
    self._args = vars(cli_parser.parse_args())

    # Log handler
    self._logger = Utils.configure_log(self._args)

    # MongoDB is mandatory: abort when it cannot be configured
    configured = Utils.configure_mongodb(self, **self._params)
    if not configured:
        self._logger.fatal('Error configuring MongoDB')
        sys.exit(errno.ECONNREFUSED)

    # Seed terms coming from the input received as argument
    terms = BootstrappingSeed.Seed(self._args['bootstrapping-terms'])
    terms.initialize_seed_class()

    # Proxies are only loaded when a path was supplied
    if not self._args['proxies_path']:
        self._proxies = None
    else:
        self._proxies = Utils.load_proxies(self._args)
        print('Loaded Proxies : %d' % len(self._proxies))

    # "verify" must be false whenever "debug_https" is true
    self._verify_certificate = not self._args['debug_https']

    # Categories first ...
    for top_category in terms._top_level_categories:
        self.crawl_category(top_category)

    # ... then the specific words
    for term in terms.get_words():
        self.crawl_by_search_word(term)
def scrape_apps(self):
    """
    Main method of the 'Worker' layer of this project.

    Repeatedly takes a record from the seed queue, downloads the app
    page, parses the app data and its related apps, stores the app into
    the apps collection and feeds related urls that are in neither
    collection back into the seed collection. Returns once the queue
    hands out no more records.

    Exits the process with errno.ECONNREFUSED when MongoDB cannot be
    configured.
    """
    # Arguments Parsing
    args_parser = self.get_arguments_parser()
    self._args = vars(args_parser.parse_args())

    # Log Handler Configuring
    self._logger = Utils.configure_log(self._args)

    # MongoDB Configuring
    if not Utils.configure_mongodb(self, **self._params):
        self._logger.fatal('Error configuring MongoDB')
        sys.exit(errno.ECONNREFUSED)

    apps_collection = self._params['apps_collection']

    # Making sure indexes exist
    self._mongo_wrapper.ensure_index('IsBusy')
    self._mongo_wrapper.ensure_index('_id', apps_collection)

    # Proxies Loading
    self._proxies = Utils.load_proxies(self._args)

    # if "Debug Http" is set to true, "verify" must be "false"
    self._verify_certificate = not self._args['debug_https']
    self._is_using_proxies = self._proxies is not None

    # Control Variables - Used on the 'retrying logic'
    # NOTE(review): `retries` is only incremented and `max_retries` is
    # never read in this method - the retry policy looks unfinished.
    retries, max_retries = 0, 8

    parser = html_parser()

    # Loop only breaks when there are no more apps to be processed
    while True:

        # Finds an app to be processed and toggles its state to 'Busy'
        seed_record = self._mongo_wrapper.find_and_modify()
        if not seed_record:
            break

        try:
            url = seed_record['_id']

            # Relative urls are turned into absolute play store urls
            if 'http://' not in url and 'https://' not in url:
                url = 'https://play.google.com' + url

            self._logger.info('Processing: %s' % url)

            # Is this app processed already ?
            if self._mongo_wrapper.app_processed(url, apps_collection):
                self._logger.info('Duplicated App : %s. Skipped' % url)
                self._mongo_wrapper.remove_app_from_queue(seed_record)
                continue

            # Get Request for the App's Page. The headers have to be
            # passed by keyword: the second positional argument of
            # requests.get is `params` (the query string).
            response = requests.get(url,
                                    headers=HTTPUtils.headers,
                                    verify=self._verify_certificate,
                                    proxies=Utils.get_proxy(self))

            # Sanity Checks on Response
            if not response.text or response.status_code != requests.codes.ok:
                self._logger.info('Error Opening App Page : %s' % url)
                retries += 1

                # Only back off when proxies are being used
                if self._is_using_proxies:
                    Utils.sleep()

            # NOTE(review): the response is parsed even when the sanity
            # check above failed; kept as-is, confirm this is intended.
            try:
                # Scraping Data from HTML
                app = parser.parse_app_data(response.text)

                # Stamping URL into app model
                app['Url'] = url
                app['_id'] = url

                # Reaching related apps
                related_apps = parser.parse_related_apps(response.text)
                if not related_apps:
                    app['RelatedUrls'] = None
                else:
                    app['RelatedUrls'] = related_apps
                    self._logger.info('Related Apps: %s - %d'
                                      % (url, len(related_apps)))

                # Inserting data into MongoDB
                self._mongo_wrapper._insert(app, apps_collection)

                # Re-Feeding seed collection with related-app urls.
                # A dedicated loop variable keeps `url` pointing at the
                # app being processed, which the error handler below
                # relies on.
                if app['RelatedUrls']:
                    for related_url in app['RelatedUrls']:
                        if not self._mongo_wrapper.app_processed(
                                related_url, apps_collection) and \
                           not self._mongo_wrapper.app_processed(
                                related_url,
                                self._params['seed_collection']):
                            self._mongo_wrapper.insert_on_queue(
                                related_url,
                                self._params['seed_collection'])

            except Exception as exception:
                self._logger.error(exception)

                # Toggling app state back to false
                self._mongo_wrapper.toggle_app_busy(
                    url, False, self._params['seed_collection'])

        except Exception as exception:
            self._logger.error(exception)
def crawl_by_search_word(self, word):
    """
    Simulates an app search on the play store, using the word argument
    as the term to be searched, and stores the unique urls found into
    the MongoDB seed collection.

    The first page is requested until it loads or the error budget
    (self._args['max_errors']) is spent. Following pages are requested
    with the page token found in the last page that loaded. Pagination
    stops when no token is found, when an already seen url shows up, or
    when the error budget is spent.

    word -- term to be searched
    """
    self._logger.info('Scraping links of Word : %s' % word)
    parsed_urls = set()

    # Compiling regex used for parsing page token
    page_token_regex = regex.compile(r"GAEi+.+\:S\:.{11}\\42,")

    post_url = self.assemble_post_url(word)
    post_data = self.assemble_word_search_post_data()
    max_errors = self._args['max_errors']

    http_errors = 0
    # Body of the last page that loaded successfully; page tokens are
    # only ever searched in here, never in an error response.
    page_html = None

    # NOTE(review): other code in this file calls Utils.get_proxy(self);
    # the argument-less call is kept here - confirm which one is right.
    while http_errors <= max_errors:
        try:
            response = requests.post(post_url,
                                     data=post_data,
                                     headers=HTTPUtils.headers,
                                     verify=self._verify_certificate,
                                     proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            # Counted as an error so a persistent SSL failure cannot
            # keep this loop spinning forever
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, word))
            continue

        page_html = response.text
        for url in self.parse_app_urls(page_html):
            self._mongo_wrapper.insert_on_queue(url)
            parsed_urls.add(url)
        break  # Response worked

    # Paging Through Results. Only entered when the first page loaded,
    # otherwise the error budget is already spent.
    while http_errors <= max_errors:
        page_token = page_token_regex.search(page_html)
        if not page_token:
            self._logger.fatal("Couldn't find page token")
            break

        page_token = self.normalize_page_token(page_token.group())
        post_data = self.assemble_word_search_post_data(page_token)

        try:
            response = requests.post(post_url,
                                     data=post_data,
                                     headers=HTTPUtils.headers,
                                     verify=self._verify_certificate,
                                     proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            # page_html is left untouched, so the same page is retried
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, word))
            continue

        page_html = response.text
        for url in self.parse_app_urls(page_html):
            # A url seen before means the results started repeating
            if url in parsed_urls:
                return

            self._mongo_wrapper.insert_on_queue(url)
            parsed_urls.add(url)
def crawl_category(self, category):
    """
    Executes a GET request for the url of the category received, then
    paginates through the following results with POST requests, storing
    the unique urls found into the MongoDB seed collection.

    Pagination stops when an already seen url shows up or when more
    than self._args['max_errors'] errors happened.

    category -- indexable holding the category name at position 0 and
                the category url at position 1
    """
    category_name = category[0]
    category_url = category[1]

    self._logger.info('Scraping links of Category : %s' % category_name)
    parsed_urls = set()
    max_errors = self._args['max_errors']

    http_errors = 0

    # NOTE(review): other code in this file calls Utils.get_proxy(self);
    # the argument-less call is kept here - confirm which one is right.
    while http_errors <= max_errors:
        try:
            # The headers have to be passed by keyword: the second
            # positional argument of requests.get is `params`.
            response = requests.get(category_url,
                                    headers=HTTPUtils.headers,
                                    verify=self._verify_certificate,
                                    proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            # Counted as an error so a persistent SSL failure cannot
            # keep this loop spinning forever
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, category_name))
            continue

        for url in self.parse_app_urls(response.text):
            self._mongo_wrapper.insert_on_queue(url)
            parsed_urls.add(url)
        break  # Response worked

    # Paging through results
    base_skip = 60
    current_multiplier = 1

    while http_errors <= max_errors:
        post_data = self.assemble_category_post_data(current_multiplier,
                                                     base_skip)
        # The page index moves forward on every attempt: a page that
        # fails is skipped rather than retried.
        current_multiplier += 1

        try:
            response = requests.post(category_url + '?authuser=0',
                                     data=post_data,
                                     headers=HTTPUtils.headers,
                                     verify=self._verify_certificate,
                                     proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, category_name))
            continue

        for url in self.parse_app_urls(response.text):
            # A url seen before means the results started repeating
            if url in parsed_urls:
                return

            parsed_urls.add(url)
            self._mongo_wrapper.insert_on_queue(url)
def crawl_by_search_word(self, word):
    """
    Simulates an app search on the play store, using the word argument
    as the term to be searched, and stores the unique urls found into
    the MongoDB seed collection.

    The first page is requested until it loads or the error budget
    (self._args['max_errors']) is spent. Following pages are requested
    with the page token found in the last page that loaded. Pagination
    stops when no token is found, when an already seen url shows up, or
    when the error budget is spent.

    word -- term to be searched
    """
    self._logger.info('Scraping links of Word : %s' % word)
    parsed_urls = set()

    # Compiling regex used for parsing page token
    page_token_regex = regex.compile(r"GAEi+.+\:S\:.{11}\\42,")

    post_url = self.assemble_post_url(word)
    post_data = self.assemble_word_search_post_data()
    max_errors = self._args['max_errors']

    http_errors = 0
    # Body of the last page that loaded successfully; page tokens are
    # only ever searched in here, never in an error response.
    page_html = None

    # NOTE(review): other code in this file calls Utils.get_proxy(self);
    # the argument-less call is kept here - confirm which one is right.
    while http_errors <= max_errors:
        try:
            response = requests.post(post_url,
                                     data=post_data,
                                     headers=HTTPUtils.headers,
                                     verify=self._verify_certificate,
                                     proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            # Counted as an error so a persistent SSL failure cannot
            # keep this loop spinning forever
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, word))
            continue

        page_html = response.text
        for url in self.parse_app_urls(page_html):
            self._mongo_wrapper.insert_on_queue(url)
            parsed_urls.add(url)
        break  # Response worked

    # Paging Through Results. Only entered when the first page loaded,
    # otherwise the error budget is already spent.
    while http_errors <= max_errors:
        page_token = page_token_regex.search(page_html)
        if not page_token:
            self._logger.fatal("Couldn't find page token")
            break

        page_token = self.normalize_page_token(page_token.group())
        post_data = self.assemble_word_search_post_data(page_token)

        try:
            response = requests.post(post_url,
                                     data=post_data,
                                     headers=HTTPUtils.headers,
                                     verify=self._verify_certificate,
                                     proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            # page_html is left untouched, so the same page is retried
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, word))
            continue

        page_html = response.text
        for url in self.parse_app_urls(page_html):
            # A url seen before means the results started repeating
            if url in parsed_urls:
                return

            self._mongo_wrapper.insert_on_queue(url)
            parsed_urls.add(url)
def crawl_category(self, category):
    """
    Executes a GET request for the url of the category received, then
    paginates through the following results with POST requests, storing
    the unique urls found into the MongoDB seed collection.

    Pagination stops when an already seen url shows up or when more
    than self._args['max_errors'] errors happened.

    category -- indexable holding the category name at position 0 and
                the category url at position 1
    """
    category_name = category[0]
    category_url = category[1]

    self._logger.info('Scraping links of Category : %s' % category_name)
    parsed_urls = set()
    max_errors = self._args['max_errors']

    http_errors = 0

    # NOTE(review): other code in this file calls Utils.get_proxy(self);
    # the argument-less call is kept here - confirm which one is right.
    while http_errors <= max_errors:
        try:
            # The headers have to be passed by keyword: the second
            # positional argument of requests.get is `params`.
            response = requests.get(category_url,
                                    headers=HTTPUtils.headers,
                                    verify=self._verify_certificate,
                                    proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            # Counted as an error so a persistent SSL failure cannot
            # keep this loop spinning forever
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, category_name))
            continue

        for url in self.parse_app_urls(response.text):
            self._mongo_wrapper.insert_on_queue(url)
            parsed_urls.add(url)
        break  # Response worked

    # Paging through results
    base_skip = 60
    current_multiplier = 1

    while http_errors <= max_errors:
        post_data = self.assemble_category_post_data(current_multiplier,
                                                     base_skip)
        # The page index moves forward on every attempt: a page that
        # fails is skipped rather than retried.
        current_multiplier += 1

        try:
            response = requests.post(category_url + '?authuser=0',
                                     data=post_data,
                                     headers=HTTPUtils.headers,
                                     verify=self._verify_certificate,
                                     proxies=Utils.get_proxy())
        except requests.exceptions.SSLError as error:
            http_errors += 1
            self._logger.error('SSL_Error : %s' % error)
            continue

        if response.status_code != requests.codes.ok:
            http_errors += 1
            self._logger.critical('Error [%d] on Response for : %s'
                                  % (response.status_code, category_name))
            continue

        for url in self.parse_app_urls(response.text):
            # A url seen before means the results started repeating
            if url in parsed_urls:
                return

            parsed_urls.add(url)
            self._mongo_wrapper.insert_on_queue(url)
def scrape_apps(self):
    """
    Main method of the 'Worker' layer of this project.

    Repeatedly takes a record from the seed queue, downloads the app
    page, parses the app data and its related apps, stores the app into
    the apps collection and feeds related urls that are in neither
    collection back into the seed collection. Returns once the queue
    hands out no more records.

    Exits the process with errno.ECONNREFUSED when MongoDB cannot be
    configured.
    """
    # Arguments Parsing
    args_parser = self.get_arguments_parser()
    self._args = vars(args_parser.parse_args())

    # Log Handler Configuring
    self._logger = Utils.configure_log(self._args)

    # MongoDB Configuring
    if not Utils.configure_mongodb(self, **self._params):
        self._logger.fatal('Error configuring MongoDB')
        sys.exit(errno.ECONNREFUSED)

    apps_collection = self._params['apps_collection']

    # Making sure indexes exist
    self._mongo_wrapper.ensure_index('IsBusy')
    self._mongo_wrapper.ensure_index('_id', apps_collection)

    # Proxies Loading
    self._proxies = Utils.load_proxies(self._args)

    # if "Debug Http" is set to true, "verify" must be "false"
    self._verify_certificate = not self._args['debug_https']
    self._is_using_proxies = self._proxies is not None

    # Control Variables - Used on the 'retrying logic'
    # NOTE(review): `retries` is only incremented and `max_retries` is
    # never read in this method - the retry policy looks unfinished.
    retries, max_retries = 0, 8

    parser = html_parser()

    # Loop only breaks when there are no more apps to be processed
    while True:

        # Finds an app to be processed and toggles its state to 'Busy'
        seed_record = self._mongo_wrapper.find_and_modify()
        if not seed_record:
            break

        try:
            url = seed_record['_id']

            # Relative urls are turned into absolute play store urls
            if 'http://' not in url and 'https://' not in url:
                url = 'https://play.google.com' + url

            self._logger.info('Processing: %s' % url)

            # Is this app processed already ?
            if self._mongo_wrapper.app_processed(url, apps_collection):
                self._logger.info('Duplicated App : %s. Skipped' % url)
                self._mongo_wrapper.remove_app_from_queue(seed_record)
                continue

            # Get Request for the App's Page. The headers have to be
            # passed by keyword: the second positional argument of
            # requests.get is `params` (the query string).
            response = requests.get(url,
                                    headers=HTTPUtils.headers,
                                    verify=self._verify_certificate,
                                    proxies=Utils.get_proxy(self))

            # Sanity Checks on Response
            if not response.text or response.status_code != requests.codes.ok:
                self._logger.info('Error Opening App Page : %s' % url)
                retries += 1

                # Only back off when proxies are being used
                if self._is_using_proxies:
                    Utils.sleep()

            # NOTE(review): the response is parsed even when the sanity
            # check above failed; kept as-is, confirm this is intended.
            try:
                # Scraping Data from HTML
                app = parser.parse_app_data(response.text)

                # Stamping URL into app model
                app['Url'] = url
                app['_id'] = url

                # Reaching related apps
                related_apps = parser.parse_related_apps(response.text)
                if not related_apps:
                    app['RelatedUrls'] = None
                else:
                    app['RelatedUrls'] = related_apps
                    self._logger.info('Related Apps: %s - %d'
                                      % (url, len(related_apps)))

                # Inserting data into MongoDB
                self._mongo_wrapper._insert(app, apps_collection)

                # Re-Feeding seed collection with related-app urls.
                # A dedicated loop variable keeps `url` pointing at the
                # app being processed, which the error handler below
                # relies on.
                if app['RelatedUrls']:
                    for related_url in app['RelatedUrls']:
                        if not self._mongo_wrapper.app_processed(
                                related_url, apps_collection) and \
                           not self._mongo_wrapper.app_processed(
                                related_url,
                                self._params['seed_collection']):
                            self._mongo_wrapper.insert_on_queue(
                                related_url,
                                self._params['seed_collection'])

            except Exception as exception:
                self._logger.error(exception)

                # Toggling app state back to false
                self._mongo_wrapper.toggle_app_busy(
                    url, False, self._params['seed_collection'])

        except Exception as exception:
            self._logger.error(exception)