def application(env, start_response):
    """
    Main application for uwsgi to run.

    Routes the request to the controller named by the URL path and calls
    the controller method named after the (lower-cased) HTTP method.

    Arguments:
        env: dict of server environmental variables (the WSGI environ).
        start_response: uwsgi function to start HTTP response.

    Returns:
        Whatever the selected response callable returns when it is invoked
        with start_response.
    """
    # REQUEST_METHOD is client-controlled. Only real HTTP verbs may be
    # resolved to controller attributes; an unrestricted getattr() would
    # let a request reach any attribute of the controller.
    allowed_methods = (
        'get', 'head', 'post', 'put', 'patch', 'delete', 'options',
    )

    method = env['REQUEST_METHOD'].lower()
    controller_name = env['PATH_INFO'][1:] or 'crawl'

    # We accept two forms of query-strings. The first form is key-value pairs.
    # When using the first form, we only consider the first provided value for
    # any given key. The second form is an integer that implicitly designates
    # a job_id value.
    #
    # In either case, the query dict delivered to every controller has
    # a guaranteed job_id assignment, even if that value is None. No other
    # query variables are guaranteed.
    #
    # QUERY_STRING may be empty or absent, so do not index it directly.
    raw_query = env.get('QUERY_STRING') or ''
    pairs = parse_qs(raw_query)
    if pairs:
        query = {key: values[0] for key, values in pairs.items()}
    else:
        query = {'job_id': raw_query or None}

    try:
        query['job_id'] = int(query['job_id'])
    except (KeyError, TypeError, ValueError):
        query['job_id'] = None

    # If the controller is registered in controllers/__init__.py, and it offers
    # a method that corresponds with the HTTP method in use, then request a
    # response from it. Otherwise our response will be an HTTP error.
    if controller_name not in controllers.__all__:
        return http_error('404 Not Found')(start_response)

    controller = getattr(controllers, controller_name)()
    request = None
    if method in allowed_methods:
        request = getattr(controller, method, None)
    if not callable(request):
        return http_error('405 Method Not Allowed')(start_response)

    if 'post' == method:
        # CONTENT_LENGTH may be absent or empty (no length supplied) or
        # malformed; none of these may escape as an unhandled exception.
        raw_length = env.get('CONTENT_LENGTH')
        if raw_length is None or raw_length == '':
            return http_error('411 Length Required')(start_response)
        try:
            length = int(raw_length)
        except (TypeError, ValueError):
            return http_error('400 Bad Request')(start_response)
        if length < 0:
            # A negative size would make read() consume until EOF.
            return http_error('400 Bad Request')(start_response)

        try:
            body = env['wsgi.input'].read(length).decode()
        except UnicodeDecodeError:
            return http_error('400 Bad Request')(start_response)
        response = request(query, parse_qs(body))
    else:
        # query always holds at least 'job_id', so it is always passed.
        response = request(query)

    # Return our HTTP response to uwsgi.
    return response(start_response)
def application(env, start_response):
    """
    Main application for uwsgi to run.

    Routes the request to the controller named by the URL path and calls
    the controller method named after the (lower-cased) HTTP method.

    Arguments:
        env: dict of server environmental variables (the WSGI environ).
        start_response: uwsgi function to start HTTP response.

    Returns:
        Whatever the selected response callable returns when it is invoked
        with start_response.
    """
    # REQUEST_METHOD is client-controlled. Only real HTTP verbs may be
    # resolved to controller attributes; an unrestricted getattr() would
    # let a request reach any attribute of the controller.
    allowed_methods = (
        'get', 'head', 'post', 'put', 'patch', 'delete', 'options',
    )

    method = env['REQUEST_METHOD'].lower()
    controller_name = env['PATH_INFO'][1:] or 'crawl'

    # We accept two forms of query-strings. The first form is key-value pairs.
    # When using the first form, we only consider the first provided value for
    # any given key. The second form is an integer that implicitly designates
    # a job_id value.
    #
    # In either case, the query dict delivered to every controller has
    # a guaranteed job_id assignment, even if that value is None. No other
    # query variables are guaranteed.
    #
    # QUERY_STRING may be empty or absent, so do not index it directly.
    raw_query = env.get('QUERY_STRING') or ''
    pairs = parse_qs(raw_query)
    if pairs:
        query = {key: values[0] for key, values in pairs.items()}
    else:
        query = {'job_id': raw_query or None}

    try:
        query['job_id'] = int(query['job_id'])
    except (KeyError, TypeError, ValueError):
        query['job_id'] = None

    # If the controller is registered in controllers/__init__.py, and it offers
    # a method that corresponds with the HTTP method in use, then request a
    # response from it. Otherwise our response will be an HTTP error.
    if controller_name not in controllers.__all__:
        return http_error('404 Not Found')(start_response)

    controller = getattr(controllers, controller_name)()
    request = None
    if method in allowed_methods:
        request = getattr(controller, method, None)
    if not callable(request):
        return http_error('405 Method Not Allowed')(start_response)

    if 'post' == method:
        # CONTENT_LENGTH may be absent or empty (no length supplied) or
        # malformed; none of these may escape as an unhandled exception.
        raw_length = env.get('CONTENT_LENGTH')
        if raw_length is None or raw_length == '':
            return http_error('411 Length Required')(start_response)
        try:
            length = int(raw_length)
        except (TypeError, ValueError):
            return http_error('400 Bad Request')(start_response)
        if length < 0:
            # A negative size would make read() consume until EOF.
            return http_error('400 Bad Request')(start_response)

        try:
            body = env['wsgi.input'].read(length).decode()
        except UnicodeDecodeError:
            return http_error('400 Bad Request')(start_response)
        response = request(query, parse_qs(body))
    else:
        # query always holds at least 'job_id', so it is always passed.
        response = request(query)

    # Return our HTTP response to uwsgi.
    return response(start_response)
def delete(self, query=None):
    """
    Delete the specified URL through the webpages model.

    Arguments:
        query: dict that must contain:
            url: string URL.

    Returns:
        HTTP 400 when no url is supplied (including when query is omitted),
        HTTP 204 when the model reports a successful delete, otherwise
        HTTP 404.
    """
    # query defaults to None, so guard before the membership test.
    if not query or 'url' not in query:
        return http_error('400 Bad Request')

    if self.webpages_model.delete(query['url']):
        return responder(None, None, '204 No Content')
    return http_error('404 Not Found')
def get(self, query=None):
    """
    Get the status of crawling a given URL or job.

    Arguments:
        query: dict that may carry an integer 'job_id' and/or a string
               'url'. A job_id that is not an integer is treated as absent.

    Returns:
        HTTP 400 when neither an integer job_id nor a url is supplied,
        HTTP 404 when the job does not exist or the url has no webpage id,
        otherwise the rendered 'status.json' view.
    """
    # query defaults to None; treat that the same as an empty query.
    query = query or {}
    url = query.get('url')
    job_id = query.get('job_id')
    job_id_specified = isinstance(job_id, int)

    if not url and not job_id_specified:
        return http_error('400 Bad Request')

    webpage_id = None
    if url:
        webpage_id = self.webpages_model.get_webpage_info(url)['id']

    if job_id_specified and not self.jobs_model.job_exists(job_id):
        return http_error('404 Not Found')
    elif url and not webpage_id:
        return http_error('404 Not Found')

    if job_id_specified:
        # A job_id takes precedence over a url when both are given.
        urls = json.dumps(self.jobs_model.get_init_urls(job_id))
        job_status = json.dumps(self.jobs_model.get_status(job_id))
    else:
        urls = json.dumps([url])
        get_status = self.jobs_model.get_status
        statuses = (
            get_status(related_id)
            for related_id in self.webpages_model.get_job_ids(url)
        )
        # Drop jobs for which no status is reported.
        job_status = json.dumps([status for status in statuses if status])

    status_view = view('status.json',
                       {'urls': urls, 'job_status': job_status})
    return responder(status_view)
def post(self, query, postdata):
    """
    Posting to crawl (AKA /) requests spider(s) to crawl each of the
    specified webpages.

    Arguments:
        query: dict having optional depth=n, where the default is 2.
        postdata: dict of parsed form data; the first value of its 'urls'
                  entry must hold newline-separated URLs.

    Returns:
        HTTP 202 Accepted, or 400 Bad Request when 'urls' is missing or
        depth is not an integer.
    """
    if 'urls' not in postdata:
        return http_error('400 Bad Request')
    urls = postdata['urls'][0].splitlines()

    try:
        depth = int(query['depth'])
    except KeyError:
        depth = 2
    except (TypeError, ValueError):
        # A malformed depth is a client error, not a server crash.
        return http_error('400 Bad Request')

    # Register all URLs with this job even if their results are cached.
    # This allows jobs to be stopped and resumed.
    self.webpages_model.register_job(self.job_id, urls)

    # Results younger than this many seconds (15 min) are reused.
    fresh_seconds = 900

    # Collect the URLs that still need crawling instead of removing items
    # from the list that is being walked.
    to_crawl = []
    for url in urls:
        status = self.webpages_model.get_status(url)
        webpage_info = self.webpages_model.get_webpage_info(url)

        if 'processing' == status and depth > webpage_info['depth']:
            # A shallower crawl is in progress; stop it so that the deeper
            # crawl requested here can take over.
            self.spiders_model.stop(url)
        elif webpage_info['completion_datetime']:
            # Ignore webpages with good depth crawled less than 15 min ago.
            now = datetime.datetime.now()
            age = now - webpage_info['completion_datetime']
            if (age.total_seconds() < fresh_seconds
                    and depth <= webpage_info['depth']):
                continue
        to_crawl.append(url)

    self.webpages_model.add(to_crawl, depth=depth)
    self.spiders_model.deploy(self.job_id)

    crawl_view = view('crawl.json', {'job_id': self.job_id})
    return responder(crawl_view, 'application/json', '202 Accepted')
def post(self, query, postdata):
    """
    Send an abort-crawl request.

    Arguments:
        query: dict query having the following parameter:
            job_id: integer Job ID.
        postdata: Ignored.

    Returns:
        HTTP 202 Accepted once the stop request has been issued, or
        HTTP 404 Not Found when the jobs model does not know the job.
    """
    target_job = query['job_id']

    # Unknown jobs are rejected before any spider is touched.
    if not self.jobs_model.job_exists(target_job):
        return http_error('404 Not Found')

    self.spiders_model.stop(target_job)
    return responder(None, None, '202 Accepted')
def post(self, query, postdata):
    """
    Send an abort-crawl request.

    Arguments:
        query: dict query having the following parameter:
            job_id: integer Job ID.
        postdata: Ignored.

    Returns:
        HTTP 202 Accepted once the stop request has been issued, or
        HTTP 404 Not Found when the jobs model does not know the job.
    """
    target_job = query["job_id"]

    # Unknown jobs are rejected before any spider is touched.
    if not self.jobs_model.job_exists(target_job):
        return http_error("404 Not Found")

    self.spiders_model.stop(target_job)
    return responder(None, None, "202 Accepted")
def get(self, query=None):
    """
    Get a list of result images from a given web crawl.

    Arguments:
        query: dict that may carry:
            job_id: integer job id (takes precedence when it is an int).
            url: string URL, used when no integer job_id is given.

    Returns:
        The rendered 'result.json' view served as application/json, or
        HTTP 400 when neither an integer job_id nor a url is supplied.
    """
    # query defaults to None; treat that the same as an empty query.
    query = query or {}
    job_id = query.get('job_id')
    url = query.get('url')

    # The 'job_id' key can be present with a None value, so the lookup is
    # chosen by the values themselves rather than by key presence.
    if isinstance(job_id, int):
        images = self.images_model.get_by_job_id(job_id)
    elif url:
        images = self.images_model.get_by_url(url)
    else:
        return http_error('400 Bad Request')

    result_view = view('result.json', {'images': json.dumps(images)})
    return responder(result_view, 'application/json')