def scrape_apps(self):
    """Worker-layer entry point: consume seed urls and scrape app data.

    Pops urls from the seed collection (marking each one 'busy'),
    downloads the Google Play page, parses the app model and its related
    apps, stores the result into the apps collection on MongoDB and
    re-feeds the seed collection with related urls not seen before.

    Exits the process with ECONNREFUSED if MongoDB cannot be configured.
    """
    # Arguments Parsing
    args_parser = self.get_arguments_parser()
    self._args = vars(args_parser.parse_args())

    # Log Handler Configuring
    self._logger = Utils.configure_log(self._args)

    # MongoDB Configuring
    if not Utils.configure_mongodb(self, **self._params):
        self._logger.fatal('Error configuring MongoDB')
        sys.exit(errno.ECONNREFUSED)

    # Making sure indexes exist
    self._mongo_wrapper.ensure_index('IsBusy')
    self._mongo_wrapper.ensure_index('_id', self._params['apps_collection'])

    # Proxies Loading
    self._proxies = Utils.load_proxies(self._args)

    # if "Debug Http" is set to true, "verify" must be "false"
    self._verify_certificate = not self._args['debug_https']
    # BUGFIX: None comparison uses identity ('is not'), not '!='
    self._is_using_proxies = self._proxies is not None

    # Retry budget: total number of failed page downloads tolerated
    # before the worker gives up (BUGFIX: 'max_retries' was declared
    # but never enforced).
    retries, max_retries = 0, 8
    parser = html_parser()

    # Loop only breaks when there are no more apps to be processed
    while True:
        # Finds an app to be processed and toggles its state to 'Busy'
        seed_record = self._mongo_wrapper.find_and_modify()
        if not seed_record:
            break

        try:
            url = seed_record['_id']

            # Do we need to normalize the url ?
            if 'http://' not in url and 'https://' not in url:
                url = 'https://play.google.com' + url

            self._logger.info('Processing: %s' % url)

            # Is this app processed already ?
            if self._mongo_wrapper.app_processed(url, self._params['apps_collection']):
                self._logger.info('Duplicated App : %s. Skipped' % url)
                self._mongo_wrapper.remove_app_from_queue(seed_record)
                continue

            # Get Request for the App's Page
            # BUGFIX: headers must be passed via the 'headers=' keyword -
            # the second positional argument of requests.get is 'params'.
            response = requests.get(url,
                                    headers=HTTPUtils.headers,
                                    verify=self._verify_certificate,
                                    proxies=Utils.get_proxy(self))

            # Sanity Checks on Response
            if not response.text or response.status_code != requests.codes.ok:
                self._logger.info('Error Opening App Page : %s' % url)
                retries += 1

                # Retries logic are different if proxies are being used:
                # give the proxy pool some room before the next attempt
                if self._is_using_proxies:
                    Utils.sleep()

                # Release the record so it can be picked up again later
                self._mongo_wrapper.toggle_app_busy(url, False, self._params['seed_collection'])

                # BUGFIX: a failed response used to fall through and be
                # parsed anyway; also enforce the retry budget now.
                if retries > max_retries:
                    self._logger.fatal('Max retries reached. Aborting.')
                    break
                continue

            try:
                # Scraping Data from HTML
                app = parser.parse_app_data(response.text)

                # Stamping URL into app model
                app['Url'] = url
                app['_id'] = url

                # Reaching related apps
                related_apps = parser.parse_related_apps(response.text)
                if not related_apps:
                    app['RelatedUrls'] = None
                else:
                    app['RelatedUrls'] = related_apps
                    self._logger.info('Related Apps: %s - %d' % (url, len(related_apps)))

                # Inserting data into MongoDB
                self._mongo_wrapper._insert(app, self._params['apps_collection'])

                # Re-Feeding seed collection with related-app urls
                # BUGFIX: the loop variable used to shadow 'url', which
                # corrupted the toggle_app_busy() call further down.
                if app['RelatedUrls']:
                    for related_url in app['RelatedUrls']:
                        if not self._mongo_wrapper.app_processed(related_url, self._params['apps_collection']) and \
                           not self._mongo_wrapper.app_processed(related_url, self._params['seed_collection']):
                            self._mongo_wrapper.insert_on_queue(related_url, self._params['seed_collection'])

            except Exception as exception:
                self._logger.error(exception)

            # Toggling app state back to false once the record was handled
            self._mongo_wrapper.toggle_app_busy(url, False, self._params['seed_collection'])

        except Exception as exception:
            self._logger.error(exception)
def scrape_apps(self):
    """Worker-layer entry point: drain the seed queue and scrape each app.

    Configures logging, MongoDB and proxies from command-line arguments,
    then loops over the seed collection: each url is downloaded, parsed
    into an app model, stored into the apps collection, and its related
    urls are fed back into the seed queue.

    Exits the process with ECONNREFUSED if MongoDB cannot be configured.
    """
    # Arguments drive logging, proxies and certificate verification
    args_parser = self.get_arguments_parser()
    self._args = vars(args_parser.parse_args())
    self._logger = Utils.configure_log(self._args)

    # Abort the whole process if the MongoDB layer cannot be set up
    if not Utils.configure_mongodb(self, **self._params):
        self._logger.fatal('Error configuring MongoDB')
        sys.exit(errno.ECONNREFUSED)

    # Indexes backing find_and_modify and the duplicate checks
    self._mongo_wrapper.ensure_index('IsBusy')
    self._mongo_wrapper.ensure_index('_id', self._params['apps_collection'])

    self._proxies = Utils.load_proxies(self._args)

    # if "Debug Http" is set to true, "verify" must be "false"
    self._verify_certificate = not self._args['debug_https']
    # BUGFIX: None comparison uses identity ('is not'), not '!='
    self._is_using_proxies = self._proxies is not None

    # Total budget of failed downloads tolerated before giving up
    # (BUGFIX: 'max_retries' was declared but never enforced)
    retries, max_retries = 0, 8
    parser = html_parser()

    # Loop only breaks when there are no more apps to be processed
    while True:
        # Finds an app to be processed and toggles its state to 'Busy'
        seed_record = self._mongo_wrapper.find_and_modify()
        if not seed_record:
            break

        try:
            url = self._normalize_app_url(seed_record['_id'])
            self._logger.info('Processing: %s' % url)

            # Is this app processed already ?
            if self._mongo_wrapper.app_processed(url, self._params['apps_collection']):
                self._logger.info('Duplicated App : %s. Skipped' % url)
                self._mongo_wrapper.remove_app_from_queue(seed_record)
                continue

            # BUGFIX: headers must go through the 'headers=' keyword -
            # the second positional argument of requests.get is 'params'.
            response = requests.get(url,
                                    headers=HTTPUtils.headers,
                                    verify=self._verify_certificate,
                                    proxies=Utils.get_proxy(self))

            # Sanity Checks on Response
            if not response.text or response.status_code != requests.codes.ok:
                # BUGFIX: failed downloads used to fall through into parsing
                self._logger.info('Error Opening App Page : %s' % url)
                retries += 1

                # When proxies are in use, wait for the pool to rotate
                if self._is_using_proxies:
                    Utils.sleep()

                # Release the record so it can be picked up again later
                self._mongo_wrapper.toggle_app_busy(url, False, self._params['seed_collection'])

                if retries > max_retries:
                    self._logger.fatal('Max retries reached. Aborting.')
                    break
                continue

            try:
                self._scrape_and_store(parser, response.text, url)
            except Exception as exception:
                self._logger.error(exception)

            # Toggling app state back to false once the record was handled
            self._mongo_wrapper.toggle_app_busy(url, False, self._params['seed_collection'])

        except Exception as exception:
            self._logger.error(exception)

def _normalize_app_url(self, url):
    """Prefix relative play-store paths with the play.google.com host."""
    if 'http://' not in url and 'https://' not in url:
        return 'https://play.google.com' + url
    return url

def _scrape_and_store(self, parser, page_html, url):
    """Parse one app page, persist the model and enqueue related urls.

    Raises whatever the parser / mongo wrapper raise; the caller logs it.
    """
    app = parser.parse_app_data(page_html)

    # Stamp the url so it doubles as the document '_id'
    app['Url'] = url
    app['_id'] = url

    related_apps = parser.parse_related_apps(page_html)
    if not related_apps:
        app['RelatedUrls'] = None
    else:
        app['RelatedUrls'] = related_apps
        self._logger.info('Related Apps: %s - %d' % (url, len(related_apps)))

    self._mongo_wrapper._insert(app, self._params['apps_collection'])

    # Re-feed the seed queue with related urls not seen in either
    # collection (BUGFIX: loop variable no longer shadows the caller's
    # 'url', which previously corrupted the busy-flag release).
    if app['RelatedUrls']:
        for related_url in app['RelatedUrls']:
            if not self._mongo_wrapper.app_processed(related_url, self._params['apps_collection']) and \
               not self._mongo_wrapper.app_processed(related_url, self._params['seed_collection']):
                self._mongo_wrapper.insert_on_queue(related_url, self._params['seed_collection'])