コード例 #1
0
ファイル: wsgi.py プロジェクト: samalba/image-spider
def application(env, start_response):
    """
    Main application for uwsgi to run.

    Dispatches the request to the controller named by the URL path (default
    'crawl') and calls the controller method named after the lower-cased
    HTTP method.

    Arguments:
        env: mapping of server environmental variables.
        start_response: uwsgi function to start HTTP response.

    Returns: whatever the selected response object returns when it is called
             with start_response.
    """
    method = env['REQUEST_METHOD'].lower()

    # WSGI allows PATH_INFO and QUERY_STRING to be omitted when they would be
    # empty, so read them with a default instead of indexing.
    controller_name = env.get('PATH_INFO', '')[1:] or 'crawl'
    raw_query = env.get('QUERY_STRING', '')

    # We accept two forms of query-strings. The first form is key-value pairs.
    # When using the first form, we only consider the first provided value for
    # any given key. The second form is an integer that implicitly designates
    # a job_id value.
    #
    # In either case, the query dict delivered to every controller has
    # a guaranteed 'job_id' assignment, even if that value is None. No other
    # query variables are guaranteed.
    querystring = parse_qs(raw_query) or raw_query or None
    try:
        query = {k: v[0] for k, v in querystring.items()}
    except AttributeError:
        # Not a dict: either the bare query string or None.
        query = {'job_id': querystring}
    try:
        query['job_id'] = int(query['job_id'])
    except (KeyError, TypeError, ValueError):
        query['job_id'] = None

    # Only controllers registered in controllers.__all__ are routable.
    if controller_name not in controllers.__all__:
        response = http_error('404 Not Found')
        return response(start_response)

    controller = getattr(controllers, controller_name)()

    # The HTTP method comes from the client; never let it resolve to a
    # private or dunder attribute of the controller.
    request = None
    if not method.startswith('_'):
        request = getattr(controller, method, None)
    if not request:
        response = http_error('405 Method Not Allowed')
        return response(start_response)

    if 'post' == method:
        # CONTENT_LENGTH may be absent or an empty string.
        raw_length = env.get('CONTENT_LENGTH')
        if not raw_length:
            response = http_error('411 Length Required')
            return response(start_response)
        try:
            length = int(raw_length)
        except ValueError:
            response = http_error('400 Bad Request')
            return response(start_response)
        postdata = parse_qs(env['wsgi.input'].read(length).decode())
        response = request(query, postdata)
    else:
        # query always contains 'job_id', so it is never empty here.
        response = request(query)

    # Return our HTTP response to uwsgi.
    return response(start_response)
コード例 #2
0
ファイル: wsgi.py プロジェクト: samalba/image-spider
def application(env, start_response):
    """
    Main application for uwsgi to run.

    Dispatches the request to the controller named by the URL path (default
    'crawl') and calls the controller method named after the lower-cased
    HTTP method.

    Arguments:
        env: mapping of server environmental variables.
        start_response: uwsgi function to start HTTP response.

    Returns: whatever the selected response object returns when it is called
             with start_response.
    """
    method = env['REQUEST_METHOD'].lower()

    # WSGI allows PATH_INFO and QUERY_STRING to be omitted when they would be
    # empty, so read them with a default instead of indexing.
    controller_name = env.get('PATH_INFO', '')[1:] or 'crawl'
    raw_query = env.get('QUERY_STRING', '')

    # We accept two forms of query-strings. The first form is key-value pairs.
    # When using the first form, we only consider the first provided value for
    # any given key. The second form is an integer that implicitly designates
    # a job_id value.
    #
    # In either case, the query dict delivered to every controller has
    # a guaranteed 'job_id' assignment, even if that value is None. No other
    # query variables are guaranteed.
    querystring = parse_qs(raw_query) or raw_query or None
    try:
        query = {k: v[0] for k, v in querystring.items()}
    except AttributeError:
        # Not a dict: either the bare query string or None.
        query = {'job_id': querystring}
    try:
        query['job_id'] = int(query['job_id'])
    except (KeyError, TypeError, ValueError):
        query['job_id'] = None

    # Only controllers registered in controllers.__all__ are routable.
    if controller_name not in controllers.__all__:
        response = http_error('404 Not Found')
        return response(start_response)

    controller = getattr(controllers, controller_name)()

    # The HTTP method comes from the client; never let it resolve to a
    # private or dunder attribute of the controller.
    request = None
    if not method.startswith('_'):
        request = getattr(controller, method, None)
    if not request:
        response = http_error('405 Method Not Allowed')
        return response(start_response)

    if 'post' == method:
        # CONTENT_LENGTH may be absent or an empty string.
        raw_length = env.get('CONTENT_LENGTH')
        if not raw_length:
            response = http_error('411 Length Required')
            return response(start_response)
        try:
            length = int(raw_length)
        except ValueError:
            response = http_error('400 Bad Request')
            return response(start_response)
        postdata = parse_qs(env['wsgi.input'].read(length).decode())
        response = request(query, postdata)
    else:
        # query always contains 'job_id', so it is never empty here.
        response = request(query)

    # Return our HTTP response to uwsgi.
    return response(start_response)
コード例 #3
0
    def delete(self, query=None):
        """
        Delete the specified URL, all related images, and all crawled children
        of that URL from the datastores.

        Arguments:
            query: dict of query values (or None):
                url: string URL.

        Returns: HTTP 204 when webpages_model.delete() reports success,
                 HTTP 400 when no url is supplied, otherwise HTTP 404.
        """

        # query defaults to None, so guard before the membership test.
        if not query or 'url' not in query:
            return http_error('400 Bad Request')

        if self.webpages_model.delete(query['url']):
            return responder(None, None, '204 No Content')
        return http_error('404 Not Found')
コード例 #4
0
ファイル: result.py プロジェクト: samalba/image-spider
    def delete(self, query=None):

        """
        Delete the specified URL, all related images, and all crawled children
        of that URL from the datastores.

        Arguments:
            query: dict of query values (or None):
                url: string URL.

        Returns: HTTP 204 when webpages_model.delete() reports success,
                 HTTP 400 when no url is supplied, otherwise HTTP 404.
        """

        # query defaults to None, so guard before the membership test.
        if not query or 'url' not in query:
            return http_error('400 Bad Request')

        if self.webpages_model.delete(query['url']):
            return responder(None, None, '204 No Content')
        return http_error('404 Not Found')
コード例 #5
0
    def get(self, query=None):

        """
        Get the status of crawling a given URL or job.

        Arguments:
            query: dict (or None) that may carry:
                job_id: integer job id.
                url: string URL.

        Returns: JSON spider status, HTTP 400 when neither an integer job_id
                 nor a url is given, or HTTP 404 when the job or the webpage
                 is unknown.
        """

        # query defaults to None; treat that the same as an empty query.
        query = query or {}
        url = query.get('url')
        job_id = query.get('job_id')
        job_id_specified = isinstance(job_id, int)

        # Reject unusable requests before touching the models.
        if not url and not job_id_specified:
            return http_error('400 Bad Request')

        webpage_id = None
        if url:
            webpage_id = self.webpages_model.get_webpage_info(url)['id']

        if job_id_specified and not self.jobs_model.job_exists(job_id):
            return http_error('404 Not Found')
        if url and not webpage_id:
            return http_error('404 Not Found')

        if job_id_specified:
            # An integer job_id takes precedence over url.
            urls = json.dumps(self.jobs_model.get_init_urls(job_id))
            job_status = json.dumps(self.jobs_model.get_status(job_id))
        else:
            urls = json.dumps([url])
            related_job_ids = self.webpages_model.get_job_ids(url)
            statuses = (self.jobs_model.get_status(related_id)
                        for related_id in related_job_ids)
            # Drop falsy statuses, keeping only jobs that reported something.
            job_status = json.dumps([status for status in statuses if status])

        status_view = view('status.json', {'urls': urls,
                                           'job_status': job_status})
        return responder(status_view)
コード例 #6
0
ファイル: crawl.py プロジェクト: samalba/image-spider
    def post(self, query, postdata):

        """
        Posting to crawl (AKA /) requests spider(s) to crawl each of the
        specified webpages.

        Arguments:
            query: dict having optional depth=n, where the default is 2.
            postdata: mapping whose 'urls' entry is a list; the first element
                      holds the newline-separated URLs to crawl.

        Returns: HTTP 202 Accepted, or 400 Bad Request when 'urls' is missing
                 or depth is not an integer.
        """

        if 'urls' not in postdata:
            return http_error('400 Bad Request')
        urls = postdata['urls'][0].splitlines()

        try:
            depth = int(query['depth'])
        except KeyError:
            depth = 2
        except (TypeError, ValueError):
            # A malformed depth is a client error, not a server failure.
            return http_error('400 Bad Request')

        # Register all URLs with this job even if their results are cached.
        # This allows jobs to be stopped and resumed.
        self.webpages_model.register_job(self.job_id, urls)

        # Iterate through a copy of urls, since items may be removed from it.
        for url in urls[:]:
            status = self.webpages_model.get_status(url)
            webpage_info = self.webpages_model.get_webpage_info(url)

            if 'processing' == status and depth > webpage_info['depth']:
                # A deeper crawl was requested than the one in progress.
                self.spiders_model.stop(url)

            elif webpage_info['completion_datetime']:
                # Ignore webpages with good depth crawled less than 15 min ago.
                now = datetime.datetime.now()
                td = now - webpage_info['completion_datetime']
                if 900 > td.total_seconds() and depth <= webpage_info['depth']:
                    urls.remove(url)

        self.webpages_model.add(urls, depth=depth)
        self.spiders_model.deploy(self.job_id)

        crawl_view = view('crawl.json', {'job_id': self.job_id})
        return responder(crawl_view, 'application/json', '202 Accepted')
コード例 #7
0
    def post(self, query, postdata):
        """
        Ask the spiders to abort the crawl belonging to a job.

        Arguments:

            query: dict query having the following parameter:
                job_id: integer Job ID.

            postdata: Ignored.

        Returns: HTTP 202 Accepted when the job exists, otherwise HTTP 404.
        """

        target_job = query['job_id']

        # Unknown jobs are rejected before any stop request is issued.
        if not self.jobs_model.job_exists(target_job):
            return http_error('404 Not Found')

        self.spiders_model.stop(target_job)
        return responder(None, None, '202 Accepted')
コード例 #8
0
ファイル: stop.py プロジェクト: samalba/image-spider
    def post(self, query, postdata):

        """
        Ask the spiders to abort the crawl belonging to a job.

        Arguments:

            query: dict query having the following parameter:
                job_id: integer Job ID.

            postdata: Ignored.

        Returns: HTTP 202 Accepted when the job exists, otherwise HTTP 404.
        """

        target_job = query["job_id"]

        # Unknown jobs are rejected before any stop request is issued.
        if not self.jobs_model.job_exists(target_job):
            return http_error("404 Not Found")

        self.spiders_model.stop(target_job)
        return responder(None, None, "202 Accepted")
コード例 #9
0
    def get(self, query=None):
        """
        Get a list of result images from a given web crawl.

        Arguments:
            query: dict (or None) that may carry:
                job_id: integer job id.
                url: string URL, used when job_id is not an integer.

        Returns: JSON list of URLs referencing found image files, or HTTP 400
                 when neither an integer job_id nor a url is supplied.
        """

        # query defaults to None; treat that the same as an empty query.
        query = query or {}
        job_id = query.get('job_id')

        if isinstance(job_id, int):
            images = self.images_model.get_by_job_id(job_id)
        elif 'url' in query:
            images = self.images_model.get_by_url(query['url'])
        else:
            # job_id may be present but None; without a url there is nothing
            # to look up, so report a client error instead of raising.
            return http_error('400 Bad Request')

        result_view = view('result.json', {'images': json.dumps(images)})
        return responder(result_view, 'application/json')
コード例 #10
0
ファイル: result.py プロジェクト: samalba/image-spider
    def get(self, query=None):

        """
        Get a list of result images from a given web crawl.

        Arguments:
            query: dict (or None) that may carry:
                job_id: integer job id.
                url: string URL, used when job_id is not an integer.

        Returns: JSON list of URLs referencing found image files, or HTTP 400
                 when neither an integer job_id nor a url is supplied.
        """

        # query defaults to None; treat that the same as an empty query.
        query = query or {}
        job_id = query.get('job_id')

        if isinstance(job_id, int):
            images = self.images_model.get_by_job_id(job_id)
        elif 'url' in query:
            images = self.images_model.get_by_url(query['url'])
        else:
            # job_id may be present but None; without a url there is nothing
            # to look up, so report a client error instead of raising.
            return http_error('400 Bad Request')

        result_view = view('result.json', {'images': json.dumps(images)})
        return responder(result_view, 'application/json')