def _uri_from_curi(self, curi):
    """
    Build the uri tuple for a :class:`CrawlUri` and work out its priority.

    The `Etag` response header is used as the etag. The modification date
    is taken from `Last-Modified`, falling back to `Date`, and converted to
    a timestamp with :func:`time.mktime`. If a modification date is found
    the uri is handed to `_reschedule_uri`; otherwise it gets priority `1`
    and the current time in `self._timezone` as its next crawl date.

    Overwrite this method in more specific frontiers.

    :param curi: the :class:`CrawlUri`; `url` and `rep_header` are read.
    :returns: `(url, etag, mod_date, next_crawl_date, prio)`
    """
    etag = None
    mod_date = None

    response_headers = curi.rep_header
    if response_headers:
        if "Etag" in response_headers:
            etag = response_headers["Etag"]

        # `Last-Modified` wins; `Date` is only consulted while no usable
        # modification date has been found yet.
        for header_name in ("Last-Modified", "Date"):
            if mod_date:
                break
            if header_name in response_headers:
                parsed = deserialize_date_time(response_headers[header_name])
                mod_date = time.mktime(parsed.timetuple())

    if not mod_date:
        # no modification date known: schedule it for right now
        prio, next_crawl_date = (
            1,
            time.mktime(datetime.now(self._timezone).timetuple()),
        )
    else:
        # only reschedule if it has been crawled before
        prio, next_crawl_date = self._reschedule_uri(curi)

    return (curi.url, etag, mod_date, next_crawl_date, prio)
def _uri_from_curi(self, curi):
    """
    Turn a :class:`CrawlUri` into the uri tuple and compute its priority.

    The etag comes from the `Etag` response header; the modification date
    from `Last-Modified` or, if that yields nothing, from `Date`. A uri with
    a modification date is passed to `_reschedule_uri`; one without gets
    priority `1` and "now" (in `self._timezone`) as its next crawl date.

    Overwrite this method in more specific frontiers.

    :param curi: the :class:`CrawlUri`; `url` and `rep_header` are read.
    :returns: `(url, etag, mod_date, next_crawl_date, prio)`
    """
    rep_header = curi.rep_header

    def _timestamp_of(header_name):
        # parse the header value and convert it with `time.mktime`
        moment = deserialize_date_time(rep_header[header_name])
        return time.mktime(moment.timetuple())

    etag = None
    mod_date = None
    if rep_header:
        if "Etag" in rep_header:
            etag = rep_header["Etag"]
        if "Last-Modified" in rep_header:
            mod_date = _timestamp_of("Last-Modified")
        if not mod_date and 'Date' in rep_header:
            mod_date = _timestamp_of("Date")

    if mod_date:
        # only reschedule if it has been crawled before
        prio, next_crawl_date = self._reschedule_uri(curi)
    else:
        prio = 1
        next_crawl_date = time.mktime(
            datetime.now(self._timezone).timetuple())

    return (curi.url, etag, mod_date, next_crawl_date, prio)
def __call__(self, msg, out_stream):
    """
    Issue the HTTP request for the current `DataMessage`; the result is
    sent to `out_stream`.

    A `GET` request for `msg.curi.effective_url` is built from the prepared
    headers, an optional `Last-Modified` date found in the request headers
    (passed as `if_modified_since`), optional site credentials found in
    `msg.curi.optional_vars` and this processor's redirect, timeout, user
    agent and certificate settings. Proxy settings are applied when the
    instance has a `_proxy_configuration` attribute.

    :param msg: the `DataMessage` whose `curi` is fetched.
    :param out_stream: passed on to `handle_response` together with `msg`.
    """
    # prepare the HTTPHeaders
    headers = prepare_headers(msg)
    curi = msg.curi

    # check if we have a date when the page was last crawled
    last_modified = None
    if curi.req_header and "Last-Modified" in curi.req_header:
        last_modified = deserialize_date_time(
            curi.req_header["Last-Modified"])

    # credentials are only used when both username and password are present
    username = password = None
    extras = curi.optional_vars
    if (extras
            and CURI_SITE_USERNAME in extras
            and CURI_SITE_PASSWORD in extras):
        username = extras[CURI_SITE_USERNAME]
        password = extras[CURI_SITE_PASSWORD]

    # create the request
    request = HTTPRequest(
        curi.effective_url,
        method="GET",
        headers=headers,
        auth_username=username,
        auth_password=password,
        if_modified_since=last_modified,
        follow_redirects=self._follow_redirects,
        max_redirects=self._max_redirects,
        user_agent=self._user_agent,
        request_timeout=self._request_timeout,
        connect_timeout=self._connect_timeout,
        validate_cert=self._validate_cert)

    if hasattr(self, '_proxy_configuration'):
        proxy = self._proxy_configuration
        # host and port are mandatory keys, the credentials are optional
        request.proxy_host = proxy['host']
        request.proxy_port = proxy['port']
        request.proxy_username = proxy.get('user', None)
        request.proxy_password = proxy.get('password', None)

    LOG.info("proc.fetch::request for %s" % curi.url)
    self._client.fetch(request, handle_response(msg, out_stream))
def __call__(self, msg, out_stream):
    """
    Fetch the url of the current `DataMessage` and send the result to
    `out_stream`.

    The request is a `GET` for `msg.curi.effective_url`. It carries the
    prepared headers, an `if_modified_since` date when the request headers
    contain `Last-Modified`, and site credentials when both are present in
    `msg.curi.optional_vars`. If the instance has a `_proxy_configuration`
    attribute, its `host`, `port`, `user` and `password` entries are copied
    onto the request.

    :param msg: the `DataMessage` whose `curi` is fetched.
    :param out_stream: passed on to `handle_response` together with `msg`.
    """
    # prepare the HTTPHeaders
    headers = prepare_headers(msg)

    # check if we have a date when the page was last crawled
    req_header = msg.curi.req_header
    if req_header and "Last-Modified" in req_header:
        last_modified = deserialize_date_time(req_header["Last-Modified"])
    else:
        last_modified = None

    # check if we have username and password present
    site_vars = msg.curi.optional_vars
    has_credentials = (bool(site_vars)
                       and CURI_SITE_USERNAME in site_vars
                       and CURI_SITE_PASSWORD in site_vars)
    auth_username = site_vars[CURI_SITE_USERNAME] if has_credentials else None
    auth_password = site_vars[CURI_SITE_PASSWORD] if has_credentials else None

    # create the request
    request = HTTPRequest(
        msg.curi.effective_url,
        method="GET",
        headers=headers,
        auth_username=auth_username,
        auth_password=auth_password,
        if_modified_since=last_modified,
        follow_redirects=self._follow_redirects,
        max_redirects=self._max_redirects,
        user_agent=self._user_agent,
        request_timeout=self._request_timeout,
        connect_timeout=self._connect_timeout,
        validate_cert=self._validate_cert,
    )

    if hasattr(self, '_proxy_configuration'):
        proxy_settings = self._proxy_configuration
        request.proxy_host = proxy_settings['host']
        request.proxy_port = proxy_settings['port']
        # user and password may be missing from the configuration
        request.proxy_username = proxy_settings.get('user', None)
        request.proxy_password = proxy_settings.get('password', None)

    LOG.info("proc.fetch::request for %s" % msg.curi.url)
    self._client.fetch(request, handle_response(msg, out_stream))