Example #1
def stream_atom(request):
    params = dict(request.params)

    # The default value and the maximum value allowed for the limit param.
    default_limit = 100
    max_limit = 500

    try:
        params["limit"] = int(params.get("limit", default_limit))
    except (ValueError, TypeError):
        params["limit"] = default_limit

    if params["limit"] < 0:
        params["limit"] = default_limit
    if params["limit"] > max_limit:
        params["limit"] = max_limit

    try:
        annotations = request.api_client.get(
            "/search", params=params)["rows"]
    except api_client.ConnectionError as err:
        raise httpexceptions.HTTPServiceUnavailable(err)
    except api_client.Timeout as err:
        raise httpexceptions.HTTPGatewayTimeout(err)
    except api_client.APIError as err:
        raise httpexceptions.HTTPBadGateway(err)

    return dict(
        annotations=annotations,
        atom_url=request.route_url("stream_atom"),
        html_url=request.route_url("stream"),
        title=request.registry.settings.get("h.feed.title"),
        subtitle=request.registry.settings.get("h.feed.subtitle"))
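
The limit handling in Example #1 is parse-with-fallback plus clamping. As a rough sketch, the same logic could live in a small pure helper; parse_limit is a hypothetical name, not something taken from the code above:

def parse_limit(raw, default=100, maximum=500):
    # Coerce a query-string value to an int, fall back to the default on
    # garbage input or negative values, and cap the result at the maximum.
    try:
        limit = int(raw)
    except (ValueError, TypeError):
        return default
    if limit < 0:
        return default
    return min(limit, maximum)

# parse_limit("abc") -> 100, parse_limit("-5") -> 100, parse_limit("9999") -> 500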
Example #2
    def test_when_scrapyd_fails_then_it_should_not_be_operational(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            alive_response = mock.MagicMock()
            alive_response.status_code = 200

            mock_requests_get.side_effect = [
                alive_response,
                exc.HTTPBadGateway(detail="Test"),
            ]

            status = self.subject.get_operational_status()

            self.assertEqual(
                {
                    'scrapyd_alive': True,
                    'scrapyd_operational': False,
                    'scrapyd_projects': None,
                    'spiders': None,
                    'queues': None,
                    'summarized_queue': None,
                },
                status,
            )

            mock_requests_get.assert_any_call(self.URL)
            mock_requests_get.assert_called_with(
                self.EXPECTED_LIST_PROJECTS_URL)
Example #3
def last_request_status(request):
    """Returns the last requests requested.

    The request accepts an optional parameter size, which is the maximum number
    of items returned.
    """
    settings = request.registry.settings

    default_size = 10
    size_str = request.params.get('size', default_size)
    try:
        size = int(size_str)
    except ValueError:
        raise exc.HTTPBadGateway(detail="Size parameter has incorrect value")

    # Get last requests
    dbinterf = web_runner.db.DbInterface(settings['db_filename'],
                                         recreate=False)
    reqs = dbinterf.get_last_requests(size)
    dbinterf.close()

    # Get the jobid status dictionary.
    scrapyd_baseurl = settings[SCRAPYD_BASE_URL_KEY]
    scrapyd_interf = Scrapyd(scrapyd_baseurl)
    jobids_status = scrapyd_interf.get_jobs()

    # For each request, determine the request status gathering
    # the information from all jobids related to it
    for req in reqs:
        req['status'] = get_request_status(req, jobids_status)

    return reqs
Example #4
def command_result(request):
    """Report result of job."""
    name = request.matchdict['name']
    encoded_job_ids = request.matchdict['jobid']
    try:
        job_ids = decode_ids(encoded_job_ids)
    except TypeError:
        # Malformed Job ID.
        raise exc.HTTPBadRequest("The job ID is invalid.")

    settings = request.registry.settings
    cfg_template = find_command_config_from_name(settings, name)

    spider_cfgs = starmap(
        render_spider_config,
        zip(
            cfg_template.spider_configs,
            cfg_template.spider_params,
            repeat(request.params),
        ))

    # Storing the request in the internal DB
    dbinterf = web_runner.db.DbInterface(settings['db_filename'],
                                         recreate=False)
    dbinterf.new_request_event(web_runner.db.COMMAND_RESULT, job_ids,
                               request.remote_addr)
    dbinterf.close()

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])

    args = dict(request.params)
    for i, (job_id, spider_cfg) in enumerate(zip(job_ids, spider_cfgs)):
        fn = ScrapydJobHelper(settings, spider_cfg,
                              scrapyd).retrieve_job_data_fn(job_id)
        args['spider %d' % i] = fn

    cmd_line = cfg_template.cmd.format(**args)
    LOG.info("Starting command: %s", cmd_line)
    process = subprocess.Popen(
        cmd_line,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )

    LOG.info("Waiting until conn timeout for command to finish...")
    stdout, stderr = process.communicate()
    LOG.info("Process finished.")

    if process.returncode != 0:
        msg = "The command terminated with an return value of %s." \
              " Process' standard error: %s" \
              % (process.returncode, stderr)
        LOG.warn(msg)
        raise exc.HTTPBadGateway(detail=msg)

    LOG.info("Command generated %s bytes.", len(stdout))
    request.response.content_type = cfg_template.content_type
    request.response.body = stdout
    return request.response
Example #5
    def _make_uncached_request(url):
        try:
            response = requests.get(url)
            LOG.debug(
                "Requested from scrapyd resource %s and got: %s",
                url,
                response.content,
            )
            return response.json()
        except requests.exceptions.RequestException as e:
            msg = "Error contacting Scrapyd: %s" % e
            LOG.error(msg)
            raise exc.HTTPBadGateway(msg)
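
Example #5 funnels any requests.exceptions.RequestException into an HTTPBadGateway. A hedged sketch of how that error path could be exercised, in the same mock style as Example #2; the test name and URL are made up for illustration and are not part of the original suite:

    def test_uncached_request_wraps_request_errors(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            # Simulate a network failure when contacting Scrapyd.
            mock_requests_get.side_effect = \
                requests.exceptions.ConnectionError("connection refused")

            # The wrapper should surface the failure as a 502.
            with self.assertRaises(exc.HTTPBadGateway):
                Scrapyd._make_uncached_request(
                    "http://localhost:6800/listprojects.json")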
Example #6
def stream_atom(request):
    try:
        annotations = request.api_client.get(
            "/search", params={"limit": 1000})["rows"]
    except api_client.ConnectionError as err:
        raise httpexceptions.HTTPServiceUnavailable(err)
    except api_client.Timeout as err:
        raise httpexceptions.HTTPGatewayTimeout(err)
    except api_client.APIError as err:
        raise httpexceptions.HTTPBadGateway(err)

    return dict(annotations=annotations,
                atom_url=request.route_url("stream_atom"),
                html_url=request.route_url("stream"),
                title=request.registry.settings.get("h.feed.title"),
                subtitle=request.registry.settings.get("h.feed.subtitle"))
Example #7
    def _make_request(self, resource, fresh=False, cache_time=None, **query):
        """Makes a request to the configured Scrapyd instance for the resource
        passing the given query string.

        :param resource: The resource to request.
        :type resource: unicode
        :param fresh: Whether to invalidate the cache.
        :type fresh: bool
        :param cache_time: For how many seconds a fresh response would be valid.
        :type cache_time: int
        :param query: The query string parameters.
        :return: The structure from the decoded JSON.
        """
        url = urlparse.urljoin(self.scrapyd_url, resource)
        if query:
            url += '?' + urllib.urlencode(query)

        if fresh:
            LOG.debug("Invalidated cache for %r.", url)
            Scrapyd._CACHE.invalidate(url)
            result = None
        else:
            result = Scrapyd._CACHE.get(url)

        if result is not None:
            LOG.debug("Cache hit for %r.", url)
        else:
            LOG.debug("Cache miss for %r.", url)
            # Will get exclusive access to the cache.
            with Scrapyd._CACHE_LOCK:
                # Before we got access, it may have been populated.
                result = Scrapyd._CACHE.get(url)
                if result is not None:
                    LOG.debug("Cache hit after locking for %r.", url)
                else:
                    result = Scrapyd._make_uncached_request(url)

                    Scrapyd._CACHE.put(url, result, timeout=cache_time)

        # Check result response is successful.
        if result['status'].lower() != "ok":
            LOG.error("Scrapyd was not OK: %r", result)
            raise exc.HTTPBadGateway(
                "Scrapyd was not OK, it was '{status}': {message}".format(
                    **result))

        return result
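
The Scrapyd._CACHE object that Example #7's double-checked locking relies on is not shown in any of these snippets; the code only assumes something exposing get, put(..., timeout=...) and invalidate. A minimal, illustrative TTL cache with that interface (an assumption, not the project's actual cache class) might look like:

import threading
import time


class SimpleTTLCache(object):
    """Illustrative stand-in for Scrapyd._CACHE, not the original class."""

    def __init__(self, default_timeout=5):
        self._default_timeout = default_timeout
        self._entries = {}  # key -> (expires_at, value)
        self._lock = threading.Lock()

    def get(self, key):
        with self._lock:
            entry = self._entries.get(key)
            if entry is None:
                return None
            expires_at, value = entry
            if expires_at < time.time():
                # Expired: drop the entry and report a miss.
                del self._entries[key]
                return None
            return value

    def put(self, key, value, timeout=None):
        ttl = self._default_timeout if timeout is None else timeout
        with self._lock:
            self._entries[key] = (time.time() + ttl, value)

    def invalidate(self, key):
        with self._lock:
            self._entries.pop(key, None)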
Example #8
def spider_results_view(request):
    settings = request.registry.settings

    project_name = request.matchdict['project']
    spider_name = request.matchdict['spider']
    job_id = request.matchdict['jobid']

    # Storing the request in the internal DB
    dbinterf = web_runner.db.DbInterface(settings['db_filename'],
                                         recreate=False)
    dbinterf.new_request_event(web_runner.db.SPIDER_RESULT, (job_id, ),
                               request.remote_addr)
    dbinterf.close()

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])
    try:
        data_stream = ScrapydJobHelper(settings,
                                       SpiderConfig(spider_name, project_name),
                                       scrapyd).retrieve_job_data(job_id)
        request.response.body_file = data_stream
        return request.response
    except ScrapydJobException as e:
        raise exc.HTTPBadGateway(
            detail="The content could not be retrieved: %s" % e)
Example #9
def request_history(request):
    """Returns the history of a request

    The view expects to receive a requestid.
    The view returns a dictionary with the following keys:
     * request: dictionary with the main request information stored in the DB
     * jobids_info: dictionary whose keys are all jobids related to the
        requestid. Each value is a dictionary with that jobid's information.
     * history: List with history content.
     * status: String with the requestid status.

    Example of request:
        {'creation': u'2014-07-30 19:38:53.659982', 
         'params': u'{"searchterms_str": "laundry detergent", "group_name": "Gabo test1", "site": "walmart", "quantity": "100"}', 
         'requestid': 252, 
         'jobids': (u'236c257c182111e4906150465d4bc079',), 
         'remote_ip': u'127.0.0.1', 
         'group_name': u'Gabo test1', 
         'type': u'command', 
         'site': u'walmart', 
         'name': u'cat1'}

    Example of jobids_info:
        {u'17ae4f1c182111e4906150465d4bc079': {
            'spider': u'walmart_products', 
            'status': 'finished', 
            'start_time': u'2014-07-30 16:38:34.218200', 
            'end_time': u'2014-07-30 16:40:50.766396', 
            'id': u'17ae4f1c182111e4906150465d4bc079'}, 
         u'236c257c182111e4906150465d4bc079': {
            'spider': u'walmart_products', 
            'status': 'finished', 
            'start_time': '2014-07-30 16:38:54.116999', 
            'end_time': u'2014-07-30 16:41:06.851201', 
            'id': u'236c257c182111e4906150465d4bc079'}}

    Example of history:
        [["2014-07-30 21:13:02.829964", "1 hour", "Request arrived from 127.0.0.1."],
        ["2014-07-30 21:16:02.829964", "1 hour", "Request Finished"]]
    """
    settings = request.registry.settings

    try:
        requestid = int(request.matchdict['requestid'])
    except ValueError:
        raise exc.HTTPBadGateway(detail="Request id is not valid")

    # Get request info
    dbinterf = web_runner.db.DbInterface(settings['db_filename'],
                                         recreate=False)
    request_info = dbinterf.get_request(requestid)
    operations_info = dbinterf.get_req_operations(requestid)
    dbinterf.close()

    if not request_info:
        # The requestid is not recognized
        raise exc.HTTPBadGateway(detail="No info from Request id")

    # Get the jobid status dictionary.
    scrapyd_baseurl = settings[SCRAPYD_BASE_URL_KEY]
    scrapyd_interf = Scrapyd(scrapyd_baseurl)
    jobids_status = scrapyd_interf.get_jobs()

    try:
        # Get only the jobids of the current request.
        jobids_info = {
            jobid: jobids_status[jobid]
            for jobid in request_info['jobids']
        }
    except KeyError:
        jobids_info = None

    if jobids_info:
        history = _get_history(requestid, request_info, jobids_info,
                               operations_info)
        status = get_request_status(request_info, jobids_status)
    else:
        history = None
        status = UNAVAILABLE

    info = {
        'request': request_info,
        'jobids_info': jobids_info,
        'history': history,
        'status': status,
    }
    return info
Example #10
def command_start_view(request):
    """Schedules running a command plus spiders."""
    settings = request.registry.settings
    cfg_template = find_command_config_from_path(settings, request.path)

    spider_cfgs = starmap(
        render_spider_config,
        zip(
            cfg_template.spider_configs,
            cfg_template.spider_params,
            repeat(request.params),
        ))

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])

    spider_job_ids = []
    try:
        for spider_cfg, spider_params in zip(spider_cfgs,
                                             cfg_template.spider_params):
            all_params = dict(spider_params)
            all_params.update(request.params)

            jobid = ScrapydJobHelper(settings, spider_cfg,
                                     scrapyd).start_job(all_params)
            spider_job_ids.append(jobid)
            LOG.info(
                "For command at '%s', started crawl job with id '%s'.",
                cfg_template.name,
                jobid,
            )
    except ScrapydJobStartError as e:
        raise exc.HTTPBadGateway(
            "Failed to start a required crawl for command '{}'."
            " Scrapyd was not OK, it was '{}': {}".format(
                cfg_template.name, e.status, e.message))
    except ScrapydJobException as e:
        raise exc.HTTPBadGateway(
            "For command {}, unexpected error when contacting Scrapyd:"
            " {}".format(cfg_template.name, e.message))

    command_name = request.path.strip('/')
    id = request.route_path(
        "command pending jobs",
        name=cfg_template.name,
        jobid=encode_ids(spider_job_ids),
        _query=request.params,
    )

    # Storing the request in the internal DB
    dbinterf = web_runner.db.DbInterface(settings['db_filename'],
                                         recreate=False)
    dbinterf.new_command(
        command_name,
        dict(request.params),
        spider_job_ids,
        request.remote_addr,
        id=id,
    )
    dbinterf.close()

    raise exc.HTTPFound(location=id,
                        detail="Command '{}' started with {} crawls.".format(
                            cfg_template.name, len(spider_job_ids)))