def preprocess_url(self, referrer, url):
    ''' Clean and filter URLs before scraping.

    The link is resolved against ``referrer``, one trailing slash is removed
    from its path and its fragment is dropped. The cleaned URL is returned
    only when its netloc equals ``self.domain``, it contains none of the
    ignored markers, and neither its http nor its https form is already in
    ``self.urls``. In every other case ``None`` is returned.
    '''
    if not url:
        return None

    skipped_markers = ('.pdf', '.jpg', 'tel:', '.dmg')

    # Convert to an absolute URL, then strip the trailing "/" and any
    # in-page target.
    parts = urlsplit(urljoin(referrer, url))
    parts = parts._replace(path=re.sub(r'/$', '', parts.path), fragment='')

    # Scrape pages of the current domain only.
    if parts.netloc != self.domain:
        return None

    cleanurl = parts.geturl()
    if parts.scheme == 'http':
        http_form = cleanurl
        https_form = cleanurl.replace('http:', 'https:', 1)
    else:
        https_form = cleanurl
        http_form = cleanurl.replace('https:', 'http:', 1)

    if any(marker in https_form or marker in http_form
           for marker in skipped_markers):
        return None

    # Return the URL only if neither scheme variant is already in the list.
    if http_form in self.urls or https_form in self.urls:
        return None
    return cleanurl
def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    Strings whose scheme is not in ``URL_SCHEMES``, or that have no
    hostname, are returned unchanged (apart from proxy-prefix stripping).
    The fragment is always dropped from normalized URLs.

    :type uristr: unicode
    :rtype: unicode
    """
    # Proxied URLs carry VIA_PREFIX in front of the real URL; strip it once.
    if any(uristr.startswith(VIA_PREFIX + known + ":") for known in URL_SCHEMES):
        uristr = uristr[len(VIA_PREFIX):]

    parts = urlsplit(uristr)

    # Only URLs with a known scheme and a hostname are normalized.
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return uristr

    normalized = SplitResult(
        _normalize_scheme(parts),
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,  # no fragment in the normalized form
    )
    return normalized.geturl()
def _add_params_to_url(url, params):
    """Adds parameters as a query part of the URL

    Parameters already present in the URL's query string take precedence
    over entries of ``params`` with the same name. The ``params`` mapping
    passed by the caller is not modified.

    :param url: URL
    :type url: string
    :param params: Dictionary containing parameters (``None`` is treated as
        an empty dictionary)
    :type params: Dict
    :return: URL with parameters formatted as a query string
    :rtype: string
    """
    url_parts = urlsplit(url)

    # Merge into a copy so the caller's dictionary is left untouched.
    merged_params = dict(params or {})

    # Existing parameters from the URL are layered on top of the custom ones.
    # parse_qs yields lists of values, hence doseq=True when re-encoding.
    merged_params.update(parse_qs(url_parts.query))
    new_query = urlencode(merged_params, doseq=True)

    return SplitResult(
        url_parts.scheme,
        url_parts.netloc,
        url_parts.path,
        new_query,
        url_parts.fragment,
    ).geturl()
def url(self):
    """Build the full request URL from the environ-style mapping ``self``.

    The forwarded protocol and host headers win over ``wsgi.url_scheme``
    (default ``'http'``) and ``HTTP_HOST`` when they are present and
    non-empty. The path comes from ``self.path`` and the query from
    ``QUERY_STRING``; no fragment is included.
    """
    scheme = self.get('HTTP_X_FORWARDED_PROTO')
    if not scheme:
        scheme = self.get('wsgi.url_scheme', 'http')

    netloc = self.get('HTTP_X_FORWARDED_HOST')
    if not netloc:
        netloc = self.get('HTTP_HOST')

    query = self.get("QUERY_STRING")

    return SplitResult(
        scheme=scheme,
        netloc=netloc,
        path=self.path,
        query=query,
        fragment='',
    ).geturl()
def resolve_url(requested: SplitResult, *, prefix=''):
    """Check the requested URL and redirect when it is not in canonical form.

    An empty path is replaced by ``'/'``. The URL is then passed to
    ``fetch.guard_incoming_url``; a truthy result is handed to ``abort``.
    When the resulting URL differs from the current request path (without
    its leading slash), a 307 redirect to
    ``<scheme>://<host><prefix>/<url>`` is returned, with the incoming query
    string joined onto the URL if there is one. Otherwise the (possibly
    path-adjusted) ``SplitResult`` is returned.
    """
    if requested.path:
        url_ = requested
    else:
        # No path at all: use the root path, keeping every other component.
        url_ = SplitResult(
            requested[0], requested[1], '/', requested[3], requested[4])

    guard = fetch.guard_incoming_url(url_, request)
    if guard:
        abort(guard)

    target = url_.geturl()
    if target == request.path[1:]:
        return url_

    raw_query = request.query_string
    if raw_query:
        target = urljoin(target, f'?{raw_query.decode("utf8")}')
    return redirect(f'{request.scheme}://{request.host}{prefix}/{target}', 307)
def guard_incoming_url(requested: SplitResult, flask_request: Request):
    """Return an exception object describing why the URL is unusable, or None.

    * scheme is http/https and a netloc is present -> ``None``
    * scheme is http/https but the netloc is empty -> ``PortalMissingDomain``
    * some other non-empty scheme -> ``PortalUnsupportedScheme``
    * no scheme at all -> ``PortalMissingProtocol``, built from the URL
      prefixed with ``https:`` plus the request's query string, if any

    The exception is returned, not raised.
    """
    scheme = requested.scheme

    if scheme in {'http', 'https'}:
        if requested.netloc:
            return None
        return exceptions.PortalMissingDomain(requested.geturl())

    if scheme:
        return exceptions.PortalUnsupportedScheme(scheme)

    # No scheme given: build an https form of the URL for the exception.
    query = flask_request.query_string.decode('utf8')
    suggestion = f'https:{requested.geturl()}'
    if query:
        suggestion = f'{suggestion}?{query}'
    return exceptions.PortalMissingProtocol(suggestion)
def preprocess_url(self, referrer, url):
    """Return a cleaned absolute URL worth visiting, or None.

    ``url`` is resolved against ``referrer``; one trailing slash is removed
    from the path and the fragment is discarded. The result is returned only
    if its netloc equals ``self.domain`` and neither its http nor its https
    spelling is already present in ``self.urls``.
    """
    if not url:
        return None

    absolute = urlsplit(urljoin(referrer, url))
    cleaned = SplitResult(
        scheme=absolute.scheme,
        netloc=absolute.netloc,
        path=re.sub(r'/$', '', absolute.path),
        query=absolute.query,
        fragment='',
    )

    if cleaned.netloc != self.domain:
        return None

    candidate = cleaned.geturl()
    # Build the (http, https) pair so both spellings can be checked.
    if cleaned.scheme == 'http':
        variants = (candidate, candidate.replace('http:', 'https:', 1))
    else:
        variants = (candidate.replace('https:', 'http:', 1), candidate)

    for variant in variants:
        if variant in self.urls:
            return None
    return candidate
def preprocess_url(self, referrer, url):
    ''' Clean and filter URLs before scraping.

    Resolves ``url`` against ``referrer``, removes one trailing slash from
    the path and drops the fragment. Returns the cleaned URL when it belongs
    to ``self.domain`` and neither scheme variant (http/https) is already in
    ``self.urls``; returns ``None`` otherwise.
    '''
    if not url:
        return None

    scheme, netloc, path, query, _fragment = urlsplit(urljoin(referrer, url))
    normalised = SplitResult(scheme, netloc, re.sub(r'/$', '', path), query, '')

    # Scrape pages of the current domain only.
    if netloc != self.domain:
        return None

    cleanurl = normalised.geturl()
    if scheme == 'http':
        http_variant = cleanurl
        https_variant = cleanurl.replace('http:', 'https:', 1)
    else:
        https_variant = cleanurl
        http_variant = cleanurl.replace('https:', 'http:', 1)

    already_known = http_variant in self.urls or https_variant in self.urls
    return None if already_known else cleanurl
def _get_escaped_full_path(self, request):
    """
    Django considers "safe" some characters that aren't so for oauthlib.
    We have to search for them and properly escape.

    Every query character outside ``urlencoded`` is percent-quoted first;
    the query is then parsed into pairs and re-encoded before the URL is
    reassembled.
    """
    components = list(urlparse(request.get_full_path()))

    # Index 4 of the urlparse result is the query string.
    query = components[4]
    for char in set(query).difference(urlencoded):
        query = query.replace(char, quote(char, safe=b""))
    components[4] = query

    uri = urlsplit(urlunparse(components))
    reencoded_query = urllib_urlencode(parse_qsl(uri.query), doseq=False)

    return SplitResult(
        uri.scheme, uri.netloc, uri.path, reencoded_query, uri.fragment
    ).geturl()
def parse_origin(url):
    """
    Return the origin of a URL or None if empty or invalid.

    Per https://tools.ietf.org/html/rfc6454#section-7 :
    Return ``<scheme> + '://' + <host> + <port>`` for a URL.

    A URL that ``urlsplit`` rejects with ``ValueError`` is treated as
    invalid and yields ``None`` instead of propagating the error.

    :param url: URL string
    :rtype: str or None
    """
    if url is None:
        return None

    try:
        parsed = urlsplit(url)
    except ValueError:
        # e.g. an unbalanced "[" in the host part ("Invalid IPv6 URL")
        return None

    # netloc contains both host and port
    origin = SplitResult(parsed.scheme, parsed.netloc, "", "", "")

    # An origin that renders as the empty string is reported as None.
    return origin.geturl() or None
def run(self):
    """
    Run node, spawning entity and doing other actions as configured in program arguments.

    Steps, in order: optionally wait for another entity to appear, load the
    entity XML from exactly one source (file, topic, model database template
    or stdin), validate the XML, optionally rewrite ``package://`` mesh URIs
    to ``model://``, spawn the entity at the requested pose, optionally
    unpause physics, and optionally block until shutdown and then delete
    the entity.

    Returns exit code, 1 for failure, 0 for success
    """
    # Wait for entity to exist if wait flag is enabled
    if self.args.wait:
        self.entity_exists = False

        def entity_cb(entity):
            self.entity_exists = self.args.wait in entity.name

        self.subscription = self.create_subscription(
            ModelStates, '%s/model_states' % self.args.gazebo_namespace,
            entity_cb, 10)

        self.get_logger().info(
            'Waiting for entity {} before proceeding.'.format(self.args.wait))

        while rclpy.ok() and not self.entity_exists:
            rclpy.spin_once(self)

    # NOTE: messages are formatted before being handed to the logger; the
    # rclpy logger methods take the finished message rather than
    # printf-style arguments.

    # Load entity XML from file
    if self.args.file:
        self.get_logger().info(
            'Loading entity XML from file %s' % self.args.file)
        if not os.path.exists(self.args.file):
            self.get_logger().error(
                'Error: specified file %s does not exist' % self.args.file)
            return 1
        if not os.path.isfile(self.args.file):
            self.get_logger().error(
                'Error: specified file %s is not a file' % self.args.file)
            return 1
        # load file; the context manager closes the handle on every path
        try:
            with open(self.args.file, 'r') as f:
                entity_xml = f.read()
        except IOError as e:
            self.get_logger().error(
                'Error reading file {}: {}'.format(self.args.file, e))
            return 1
        if entity_xml == '':
            self.get_logger().error(
                'Error: file %s is empty' % self.args.file)
            return 1
    # Load entity XML published on topic specified
    elif self.args.topic:
        self.get_logger().info(
            'Loading entity published on topic %s' % self.args.topic)
        entity_xml = ''

        def entity_xml_cb(msg):
            nonlocal entity_xml
            entity_xml = msg.data

        # NOTE(review): a durability policy enum is passed where
        # create_subscription takes its QoS argument - confirm this yields
        # the intended transient-local subscription.
        self.subscription = self.create_subscription(
            String, self.args.topic, entity_xml_cb,
            QoSDurabilityPolicy.RMW_QOS_POLICY_DURABILITY_TRANSIENT_LOCAL)

        while rclpy.ok() and entity_xml == '':
            self.get_logger().info(
                'Waiting for entity xml on %s' % self.args.topic)
            rclpy.spin_once(self)
    # Generate entity XML by putting requested entity name into request template
    elif self.args.database:
        self.get_logger().info(
            'Loading entity XML from Gazebo Model Database')
        entity_xml = self.MODEL_DATABASE_TEMPLATE.format(self.args.database)
    elif self.args.stdin:
        self.get_logger().info('Loading entity XML from stdin')
        entity_xml = sys.stdin.read()
        if entity_xml == '':
            self.get_logger().error('Error: stdin buffer was empty')
            return 1
    else:
        # Without this branch entity_xml would be unbound below.
        self.get_logger().error('Error: no entity XML source was specified')
        return 1

    # Parse xml to detect invalid xml before sending to gazebo
    try:
        xml_parsed = ElementTree.fromstring(entity_xml)
    except ElementTree.ParseError as e:
        self.get_logger().error('Invalid XML: {}'.format(e))
        return 1

    # Replace package:// with model:// for mesh tags if flag is set
    if self.args.package_to_model:
        for element in xml_parsed.iterfind('.//mesh'):
            filename_tag = element.get('filename')
            if filename_tag is None:
                continue
            url = urlsplit(filename_tag)
            if url.scheme == 'package':
                url = SplitResult('model', *url[1:])
                element.set('filename', url.geturl())

    # Encode xml object back into string for service call
    entity_xml = ElementTree.tostring(xml_parsed)

    # Form requested Pose from arguments
    initial_pose = Pose()
    initial_pose.position.x = float(self.args.x)
    initial_pose.position.y = float(self.args.y)
    initial_pose.position.z = float(self.args.z)

    # q is indexed as (w, x, y, z) here
    q = quaternion_from_euler(self.args.R, self.args.P, self.args.Y)
    initial_pose.orientation.w = q[0]
    initial_pose.orientation.x = q[1]
    initial_pose.orientation.y = q[2]
    initial_pose.orientation.z = q[3]

    success = self._spawn_entity(entity_xml, initial_pose)
    if not success:
        self.get_logger().error('Spawn service failed. Exiting.')
        return 1

    # TODO(shivesh): Wait for /set_model_configuration
    # (https://github.com/ros-simulation/gazebo_ros_pkgs/issues/779)
    # Apply joint positions if any specified
    # if len(self.args.joints) != 0:
    #     joint_names = [joint[0] for joint in self.args.joints]
    #     joint_positions = [joint[1] for joint in self.args.joints]
    #     success = _set_model_configuration(joint_names, joint_positions)
    #     if not success:
    #         self.get_logger().error('SetModelConfiguration service failed. Exiting.')
    #         return 1

    # Unpause physics if user requested
    if self.args.unpause:
        unpause_service = '%s/unpause_physics' % self.args.gazebo_namespace
        client = self.create_client(Empty, unpause_service)
        if client.wait_for_service(timeout_sec=self.args.timeout):
            self.get_logger().info(
                'Calling service %s/unpause_physics' % self.args.gazebo_namespace)
            client.call_async(Empty.Request())
        else:
            # A failed unpause is logged but does not change the exit code.
            self.get_logger().error(
                'Service %s/unpause_physics unavailable. '
                'Was Gazebo started with GazeboRosInit?'
                % self.args.gazebo_namespace)

    # If bond enabled, setup shutdown callback and wait for shutdown
    if self.args.bond:
        self.get_logger().info(
            'Waiting for shutdown to delete entity [{}]'.format(
                self.args.entity))

        try:
            rclpy.spin(self)
        except KeyboardInterrupt:
            self.get_logger().info('Ctrl-C detected')

        self._delete_entity()

    return 0
def run(self):
    '''
    Run node, spawning model and doing other actions as configured in program arguments.

    Steps, in order: optionally wait for another model to appear, load the
    model XML from exactly one source (file, ROS parameter, model database
    template or stdin), validate the XML, optionally rewrite ``package://``
    mesh URIs to ``model://``, spawn the model through the URDF or SDF
    service, optionally apply joint positions, optionally unpause physics,
    and optionally block until shutdown with model deletion registered as a
    shutdown hook.

    Returns exit code, 1 for failure, 0 for success
    '''
    # Wait for model to exist if wait flag is enabled
    if self.args.wait:
        self.model_exists = False

        def models_cb(models):
            self.model_exists = self.args.wait in models.name

        rospy.Subscriber("%s/model_states" % self.args.gazebo_namespace,
                         ModelStates, models_cb)
        r = rospy.Rate(10)
        rospy.loginfo('Waiting for model {} before proceeding.'.format(
            self.args.wait))
        while not rospy.is_shutdown() and not self.model_exists:
            r.sleep()
        if rospy.is_shutdown():
            return 0

    # Load model XML from file
    if self.args.file:
        rospy.loginfo("Loading model XML from file %s" % self.args.file)
        if not os.path.exists(self.args.file):
            rospy.logfatal("Error: specified file %s does not exist",
                           self.args.file)
            return 1
        if not os.path.isfile(self.args.file):
            rospy.logfatal("Error: specified file %s is not a file",
                           self.args.file)
            return 1
        # load file; the context manager closes the handle on every path
        try:
            with open(self.args.file, 'r') as f:
                model_xml = f.read()
        except IOError as e:
            rospy.logerr("Error reading file {}: {}".format(
                self.args.file, e))
            return 1
        if model_xml == "":
            rospy.logerr("Error: file %s is empty", self.args.file)
            return 1
    # Load model XML from ROS param
    elif self.args.param:
        rospy.loginfo("Loading model XML from ros parameter %s" %
                      self.args.param)
        # Default to "" so that a missing parameter is reported by the
        # check below instead of raising KeyError.
        model_xml = rospy.get_param(self.args.param, "")
        if model_xml == "":
            rospy.logerr("Error: param does not exist or is empty")
            return 1
    # Generate model XML by putting requested model name into request template
    elif self.args.database:
        rospy.loginfo("Loading model XML from Gazebo Model Database")
        model_xml = self.MODEL_DATABASE_TEMPLATE.format(self.args.database)
    elif self.args.stdin:
        rospy.loginfo("Loading model XML from stdin")
        model_xml = sys.stdin.read()
        if model_xml == "":
            rospy.logerr("Error: stdin buffer was empty")
            return 1
    else:
        # Without this branch model_xml would be unbound below.
        rospy.logerr("Error: no model XML source was specified")
        return 1

    # Parse xml to detect invalid xml before sending to gazebo
    try:
        xml_parsed = xml.etree.ElementTree.fromstring(model_xml)
    except xml.etree.ElementTree.ParseError as e:
        rospy.logerr('Invalid XML: {}'.format(e))
        return 1

    # Replace package:// with model:// for mesh tags if flag is set
    if self.args.package_to_model:
        for element in xml_parsed.iterfind('.//mesh'):
            filename_tag = element.get('filename')
            if filename_tag is None:
                continue
            url = urlsplit(filename_tag)
            if url.scheme == 'package':
                url = SplitResult('model', *url[1:])
                element.set('filename', url.geturl())

    # Encode xml object back into string for service call
    model_xml = xml.etree.ElementTree.tostring(xml_parsed)

    # For Python 3
    if not isinstance(model_xml, str):
        model_xml = model_xml.decode(encoding='ascii')

    # Form requested Pose from arguments.
    # x and y are read from the private parameters ~x_pos / ~y_pos, while z
    # and the orientation come from the command-line arguments.
    initial_pose = Pose()
    initial_pose.position.x = rospy.get_param('~x_pos')
    initial_pose.position.y = rospy.get_param('~y_pos')
    initial_pose.position.z = self.args.z
    q = quaternion_from_euler(self.args.R, self.args.P, self.args.Y)
    initial_pose.orientation = Quaternion(*q)

    # Spawn model using urdf or sdf service based on arguments.
    # If neither flag is set, success stays False and the spawn is reported
    # as failed.
    success = False
    if self.args.urdf:
        success = gazebo_interface.spawn_urdf_model_client(
            self.args.model, model_xml, self.args.robot_namespace,
            initial_pose, self.args.reference_frame,
            self.args.gazebo_namespace)
    elif self.args.sdf:
        success = gazebo_interface.spawn_sdf_model_client(
            self.args.model, model_xml, self.args.robot_namespace,
            initial_pose, self.args.reference_frame,
            self.args.gazebo_namespace)
    if not success:
        rospy.logerr('Spawn service failed. Exiting.')
        return 1

    # Apply joint positions if any specified
    if len(self.args.joints) != 0:
        joint_names = [joint[0] for joint in self.args.joints]
        joint_positions = [joint[1] for joint in self.args.joints]
        success = gazebo_interface.set_model_configuration_client(
            self.args.model, "", joint_names, joint_positions,
            self.args.gazebo_namespace)
        if not success:
            rospy.logerr('SetModelConfiguration service failed. Exiting.')
            return 1

    # Unpause physics if user requested
    if self.args.unpause:
        rospy.loginfo('Unpausing physics')
        rospy.wait_for_service('%s/unpause_physics' %
                               self.args.gazebo_namespace)
        try:
            unpause_physics = rospy.ServiceProxy(
                '%s/unpause_physics' % self.args.gazebo_namespace, Empty)
            unpause_physics()
        except rospy.ServiceException as e:
            rospy.logerr(
                "Unpause physics service call failed: {}".format(e))
            return 1

    # If bond enabled, setup shutdown callback and wait for shutdown
    if self.args.bond:
        rospy.on_shutdown(self._delete_model)
        rospy.loginfo('Waiting for shutdown to delete model {}'.format(
            self.args.model))
        rospy.spin()

    return 0
def path_only(url: SplitResult) -> str:
    """Return the rendered URL with its leading ``scheme://netloc`` cut off.

    The cut is made by length: as many characters as ``scheme://netloc``
    occupies are dropped from the front of ``url.geturl()``.
    """
    rendered = url.geturl()
    origin_length = len(f'{url.scheme}://{url.netloc}')
    return rendered[origin_length:]
def no_scheme(url: SplitResult) -> str:
    """Return the rendered URL with its leading ``scheme:`` cut off.

    The cut is made by length: as many characters as ``scheme:`` occupies
    are dropped from the front of ``url.geturl()``.
    """
    rendered = url.geturl()
    scheme_length = len(f'{url.scheme}:')
    return rendered[scheme_length:]
def url(self):
    """Return the request URL assembled from the environ-style mapping.

    Scheme: ``HTTP_X_FORWARDED_PROTO`` if set and non-empty, otherwise
    ``wsgi.url_scheme`` (default ``'http'``). Host:
    ``HTTP_X_FORWARDED_HOST`` if set and non-empty, otherwise ``HTTP_HOST``.
    Path is ``self.path``, query is ``QUERY_STRING``, and the fragment is
    always empty.
    """
    forwarded_proto = self.get('HTTP_X_FORWARDED_PROTO')
    scheme = forwarded_proto if forwarded_proto else self.get('wsgi.url_scheme', 'http')

    forwarded_host = self.get('HTTP_X_FORWARDED_HOST')
    host = forwarded_host if forwarded_host else self.get('HTTP_HOST')

    query = self.get("QUERY_STRING")

    components = (scheme, host, self.path, query, '')
    return SplitResult(*components).geturl()