def knn(df: DataFrame, p: list, k: int, coordSys: str, unique: bool):
    """
    Finds the K nearest neighbors of the query object.
    The naive implementation here searches through all the objects in
    the DataFrame to get the KNN. The nearness of the objects is decided
    on the basis of the distance between their centers.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame. Must have 3 columns corresponding to the
        coordinates (x, y, z) if cartesian or (r, theta, phi) if spherical.
    p : list of float
        Targeted point for which we want neighbors.
    k : int
        Number of neighbours.
    coordSys : str
        Coordinate system: spherical or cartesian.
    unique : bool
        Boolean. If true, returns only distinct objects. Default is false.

    Returns
    --------
    out : DataFrame
        DataFrame with the coordinates of the k neighbours found.

    Examples
    --------
    >>> df = spark.read.format("fits")\
            .option("hdu", 1)\
            .load("../src/test/resources/cartesian_points.fits")

    Get the 100 closest neighbours around the point [0.2, 0.2, 0.2]

    >>> K = 100
    >>> target = [0.2, 0.2, 0.2]
    >>> unique = False
    >>> neighbours = knn(df.select("x", "y", "z"), target, K, "spherical", unique)
    >>> print(neighbours.count())
    100

    You can add back the metadata

    >>> neighboursWithMeta = df.join(neighbours, ["x", "y", "z"], "left_semi")
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Queries.KNN".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    # To convert a Python list into a Scala List
    convpath = "{}.python.PythonClassTag.javaListtoscalaList".format(prefix)
    conv = load_from_jvm(convpath)

    out = _java2py(
        get_spark_context(),
        scalaclass(df._jdf, conv(p), k, coordSys, unique))

    return out
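
# Hedged usage sketch (not part of the original API): running `knn` on
# spherical data. The target point is arbitrary; the astro_obs.fits test file
# and the (Z_COSMO, RA, DEC) column ordering are borrowed from the
# repartitioning examples later in this module. Adapt the path and
# coordinates to your own data.
def _example_knn_spherical(spark):
    """Find the 10 distinct nearest neighbours of an arbitrary (r, theta, phi) point."""
    df = spark.read.format("fits")\
        .option("hdu", 1)\
        .load("../src/test/resources/astro_obs.fits")
    # Columns must be ordered as (r, theta, phi) for a spherical query
    target = [0.5, 1.0, 1.0]
    neighbours = knn(df.select("Z_COSMO", "RA", "DEC"), target, 10, "spherical", True)
    # Recover the full rows (metadata included) with a left_semi join
    return df.join(neighbours, ["Z_COSMO", "RA", "DEC"], "left_semi")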
def checkLoadBalancing(df: DataFrame, kind: str = "frac", numberOfElements: int = -1):
    """
    Return a DataFrame containing the weight of each partition.
    You can choose between outputting the size (number of rows) of each
    partition or its fractional size relative to the total size of the
    dataset (in percent). This is useful to check whether the load is
    correctly balanced.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    kind : str
        Print the load balancing in terms of fractional size (kind="frac")
        or number of rows per partition (kind="size"). Default is "frac".
    numberOfElements : int
        (optional) Total number of elements in the DataFrame. Only needed
        if you choose to output fractional sizes (kind="frac"). If not
        provided (i.e. default value of -1) and kind="frac", it will be
        computed (count).

    Returns
    ----------
    dfout : DataFrame
        DataFrame containing the weight of each partition.

    Examples
    ----------
    Load data

    >>> df = spark.read.format("fits")\
            .option("hdu", 1)\
            .load("../src/test/resources/astro_obs.fits")

    Fake repartitioning in 10 equal sized partitions

    >>> df = df.repartition(10)

    Compute the load balancing %

    >>> df_load = checkLoadBalancing(df, kind="frac")

    Note that this is a DataFrame, so you can use df.show().
    Here we will check that the total is indeed 100%

    >>> val = df_load.select("Load (%)").collect()
    >>> assert(int(sum([i[0] for i in val])) == 100)

    Same using number of rows instead of fractional contribution

    >>> df_load = checkLoadBalancing(df, kind="size")
    >>> val = df_load.select("Load (#Rows)").collect()
    >>> assert(int(sum([i[0] for i in val])) == df.count())
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Checkers.checkLoadBalancing".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    dfout = _java2py(
        get_spark_context(),
        scalaclass(df._jdf, kind, numberOfElements))

    return dfout
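
# Hedged sketch (illustration only): using `checkLoadBalancing` as a quick
# diagnostic before and after a plain Spark repartition. It reuses the
# astro_obs.fits test file from the doctests above.
def _example_compare_balance(spark):
    """Show the per-partition load before and after an even repartitioning."""
    df = spark.read.format("fits")\
        .option("hdu", 1)\
        .load("../src/test/resources/astro_obs.fits")
    # Load of the partitions as read from disk (possibly unbalanced)
    checkLoadBalancing(df, kind="frac").show()
    # Load after forcing 10 evenly sized partitions
    checkLoadBalancing(df.repartition(10), kind="frac").show()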
def ShellEnvelope(*args) -> JavaObject:
    """
    Binding around ShellEnvelope.scala. For full description, see
    `$spark3d/src/main/scala/com/spark3d/geometryObjects/ShellEnvelope.scala`

    The Scala version makes use of several constructors (i.e. with different
    kinds of argument). In order to mimic this within a single routine, we
    abstract the arguments of the constructor using the iterable `*args`.

    There are then 5 possibilities to instantiate a `ShellEnvelope`:

    Case 1: Defined with center coordinates, inner and outer radius.
        args = [x: Double, y: Double, z: Double, isSpherical: Boolean,
            innerRadius: Double, outerRadius: Double]
    Case 2: Defined with center coordinates, and a radius (= a sphere).
        args = [x: Double, y: Double, z: Double, isSpherical: Boolean,
            radius: Double]
    Case 3: Defined with a Point3D, and a radius (= a sphere).
        args = [p: Point3D(...), radius: Double]
    Case 4: From another ShellEnvelope
        args = [shell: ShellEnvelope(...)]
    Case 5: Null envelope
        args = []

    Returns
    ----------
    shell : ShellEnvelope instance
        An instance of the class ShellEnvelope. Throws an error if the
        iterable in the constructor is not understood.

    Example
    ----------
    >>> from pyspark3d.geometryObjects import Point3D

    Case 1: Defined with center coordinates (cart), inner and outer radius.

    >>> shell_case1 = ShellEnvelope(0.0, 1.0, 1.0, False, 0.5, 1.0)
    >>> assert("ShellEnvelope" in shell_case1.__str__())

    Case 2: Defined with center coordinates, and a radius (= a sphere).

    >>> shell_case2 = ShellEnvelope(0.0, 0.0, 0.0, False, 1.0)
    >>> print(round(shell_case2.getArea(), 1))
    12.6

    Case 3: Defined with a Point3D, and a radius (= a sphere).

    >>> origin = Point3D(0.0, 0.0, 0.0, False)
    >>> shell_case3 = ShellEnvelope(origin, 1.0)
    >>> print(shell_case3.intersects(origin))
    True

    Case 4: From another ShellEnvelope

    >>> shell_case4 = ShellEnvelope(shell_case3)
    >>> print(shell_case4.isEqual(shell_case3))
    True

    Case 5: The null shell

    >>> shell_case5 = ShellEnvelope()
    >>> print(shell_case5.isNull())
    True

    To see all the available methods:

    >>> print(sorted(shell_case1.__dir__())) # doctest: +NORMALIZE_WHITESPACE
    ['center', 'contains', 'equals', 'expandBy', 'expandInnerRadius',
    'expandOuterRadius', 'expandToInclude', 'getArea', 'getClass',
    'getEnvelope', 'getHash', 'hasCenterCloseTo', 'hashCode', 'innerRadius',
    'innerRadius_$eq', 'intersects', 'intersectsShell', 'isEqual', 'isNull',
    'isPointInShell', 'notify', 'notifyAll', 'outerRadius', 'outerRadius_$eq',
    'setToNull', 'toHealpix', 'toHealpix$default$2', 'toString', 'wait']
    """
    warning = """
    There are 5 possibilities to instantiate a `ShellEnvelope`:

    Case 1: Defined with center coordinates, inner and outer radius.
        args = [x: Double, y: Double, z: Double, isSpherical: Boolean,
            innerRadius: Double, outerRadius: Double]
    Case 2: Defined with center coordinates, and a radius (= a sphere).
        args = [x: Double, y: Double, z: Double, isSpherical: Boolean,
            radius: Double]
    Case 3: Defined with a Point3D, and a radius (= a sphere).
        args = [p: Point3D(...), radius: Double]
    Case 4: From another ShellEnvelope
        args = [shell: ShellEnvelope(...)]
    Case 5: Null envelope
        args = []
    """
    scalapath = "com.astrolabsoftware.spark3d.geometryObjects.ShellEnvelope"
    shell = load_from_jvm(scalapath)

    # Case 5
    if len(args) == 0:
        return shell()
    # Case 4
    elif len(args) == 1:
        cond_shell = "ShellEnvelope" in args[0].__str__()
        msg = """
        You are trying to instantiate a ShellEnvelope with 1 argument
        which is not a ShellEnvelope.
        {}
        """.format(warning)
        assert (cond_shell), msg

        return shell(args[0])
    # Case 3
    elif len(args) == 2:
        msg = """
        You are trying to instantiate a ShellEnvelope with 2 arguments
        which are not a Point3D (center) and a float (radius).
        {}
        """.format(warning)
        assert ("Point3D" in args[0].__str__()), msg
        assert (type(args[1]) is float or type(args[1]) is int), msg

        return shell(args[0], args[1])
    # Case 2
    elif len(args) == 5:
        msg = """
        You are trying to instantiate a ShellEnvelope with 5 arguments
        but there is one or several type mismatches.
        {}
        """.format(warning)
        assert (type(args[0]) == int or type(args[0]) == float), msg
        assert (type(args[1]) == int or type(args[1]) == float), msg
        assert (type(args[2]) == int or type(args[2]) == float), msg
        assert (type(args[3]) == bool), msg
        assert (type(args[4]) == int or type(args[4]) == float), msg

        return shell(args[0], args[1], args[2], args[3], args[4])
    # Case 1
    elif len(args) == 6:
        msg = """
        You are trying to instantiate a ShellEnvelope with 6 arguments
        but there is one or several type mismatches.
        {}
        """.format(warning)
        assert (type(args[0]) == int or type(args[0]) == float), msg
        assert (type(args[1]) == int or type(args[1]) == float), msg
        assert (type(args[2]) == int or type(args[2]) == float), msg
        assert (type(args[3]) == bool), msg
        assert (type(args[4]) == int or type(args[4]) == float), msg
        assert (type(args[5]) == int or type(args[5]) == float), msg

        return shell(args[0], args[1], args[2], args[3], args[4], args[5])
    else:
        msg = """
        Constructor not understood.
        {}
        """.format(warning)
        assert (False), msg
def BoxEnvelope(*args) -> JavaObject:
    """
    Binding around BoxEnvelope.scala. For full description, see
    `$spark3d/src/main/scala/com/spark3d/geometryObjects/BoxEnvelope.scala`

    The Scala version makes use of several constructors (i.e. with different
    kinds of argument). In order to mimic this within a single routine, we
    abstract the arguments of the constructor using the iterable `*args`.

    There are then 5 possibilities to instantiate a `BoxEnvelope`:

    Case 1: from coordinates
        args = [x1: float, x2: float, y1: float, y2: float,
            z1: float, z2: float]
    Case 2: from a single Point3D (i.e. the box is a Point3D)
        args = [p: Point3D(...)]
    Case 3: from three Point3D
        args = [p1: Point3D(...), p2: Point3D(...), p3: Point3D(...)]
    Case 4: from another BoxEnvelope
        args = [b: BoxEnvelope(...)]
    Case 5: Null envelope
        args = []

    Coordinates of input Point3D MUST be cartesian.

    Returns
    ----------
    box : BoxEnvelope instance
        An instance of the class BoxEnvelope. Throws an error if the
        iterable in the constructor is not understood.

    Example
    ----------
    >>> from pyspark3d.geometryObjects import Point3D

    Case 1: Cube from coordinates

    >>> box_case1 = BoxEnvelope(0.0, 1.0, 0.0, 1.0, 0.0, 1.0)
    >>> print(box_case1.__str__())
    Env[0.0 : 1.0, 0.0 : 1.0, 0.0 : 1.0, ]

    Case 2: Zero volume

    >>> p3d = Point3D(0.0, 0.0, 0.0, False)
    >>> box_case2 = BoxEnvelope(p3d)
    >>> print(box_case2.getVolume())
    0.0

    Case 3: Cube from 3 Point3D

    >>> p3d_1 = Point3D(0.0, 1.0, 0.0, False)
    >>> p3d_2 = Point3D(0.1, 1.0, 0.0, False)
    >>> p3d_3 = Point3D(1.0, -1.0, 1.0, False)
    >>> origin = Point3D(0.0, 0.0, 0.0, False)
    >>> box_case3 = BoxEnvelope(p3d_1, p3d_2, p3d_3)
    >>> print(box_case3.contains(origin))
    True

    Case 4: From another envelope

    >>> box_case4 = BoxEnvelope(box_case3)
    >>> print(box_case4.isEqual(box_case3))
    True

    Case 5: The null cube

    >>> box_case5 = BoxEnvelope()
    >>> print(box_case5.isNull())
    True

    To see all the available methods:

    >>> print(sorted(box_case1.__dir__())) # doctest: +NORMALIZE_WHITESPACE
    ['apply', 'center', 'contains', 'covers', 'distance', 'equals',
    'expandBy', 'expandOutwards', 'expandToInclude', 'getClass',
    'getEnvelope', 'getHash', 'getVolume', 'getXLength', 'getYLength',
    'getZLength', 'hasCenterCloseTo', 'hashCode', 'indexID', 'indexID_$eq',
    'intersection', 'intersects', 'intersectsBox', 'intersectsRegion',
    'isEqual', 'isNull', 'maxExtent', 'maxX', 'maxX_$eq', 'maxY', 'maxY_$eq',
    'maxZ', 'maxZ_$eq', 'minExtent', 'minX', 'minX_$eq', 'minY', 'minY_$eq',
    'minZ', 'minZ_$eq', 'notify', 'notifyAll', 'setToNull', 'toHealpix',
    'toHealpix$default$2', 'toString', 'translate', 'wait']
    """
    warning = """
    There are 5 possibilities to instantiate a `BoxEnvelope`:

    Case 1: from coordinates
        args = [x1: float, x2: float, y1: float, y2: float,
            z1: float, z2: float]
    Case 2: from a single Point3D (i.e. the box is a Point3D)
        args = [p: Point3D(...)]
    Case 3: from three Point3D
        args = [p1: Point3D(...), p2: Point3D(...), p3: Point3D(...)]
    Case 4: from another BoxEnvelope
        args = [b: BoxEnvelope(...)]
    Case 5: Null envelope
        args = []
    """
    scalapath = "com.astrolabsoftware.spark3d.geometryObjects.BoxEnvelope"
    box = load_from_jvm(scalapath)

    # Case 5
    if len(args) == 0:
        return box()
    # Case 2 or 4
    elif len(args) == 1:
        cond_p3d = "Point3D" in args[0].__str__()
        cond_box = "Env" in args[0].__str__()
        msg = """
        You are trying to instantiate a BoxEnvelope with 1 argument
        which is neither a Point3D nor a BoxEnvelope.
        {}
        """.format(warning)
        assert (cond_p3d or cond_box), msg

        return box(args[0])
    # Case 3
    elif len(args) == 3:
        msg = """
        You are trying to instantiate a BoxEnvelope with 3 arguments
        and at least one is not a Point3D.
        {}
        """.format(warning)
        for arg in args:
            assert ("Point3D" in arg.__str__()), msg

        return box(args[0], args[1], args[2])
    # Case 1
    elif len(args) == 6:
        msg = """
        You are trying to instantiate a BoxEnvelope with 6 arguments
        and at least one is not a number (int or float).
        {}
        """.format(warning)
        for arg in args:
            assert (type(arg) == int or type(arg) == float), msg

        return box(args[0], args[1], args[2], args[3], args[4], args[5])
    else:
        msg = """
        Constructor not understood.
        {}
        """.format(warning)
        assert (False), msg
def Point3D(x: float, y: float, z: float, isSpherical: bool) -> JavaObject:
    """
    Binding around Point3D.scala. For full description, see
    `$spark3d/src/main/scala/com/spark3d/geometryObjects/Point3D.scala`.

    By default, the input coordinates are assumed to be euclidean,
    that is (x, y, z). The user can also work with spherical input
    coordinates (x=r, y=theta, z=phi) by setting the argument
    isSpherical=true.

    Parameters
    ----------
    x : float
        Input X coordinate in Euclidean space, and R in spherical space.
    y : float
        Input Y coordinate in Euclidean space, and THETA in spherical space.
    z : float
        Input Z coordinate in Euclidean space, and PHI in spherical space.
    isSpherical : bool
        If true, it assumes that the coordinates of the Point3D are
        (r, theta, phi). Otherwise, it assumes cartesian coordinates
        (x, y, z).

    Returns
    ----------
    p3d : Point3D instance
        An instance of the class Point3D.

    Example
    ----------
    Instantiate a point with spherical coordinates (r, theta, phi)

    >>> p3d = Point3D(1.0, np.pi, 0.0, True)

    The returned type is JavaObject (Point3D instance)

    >>> print(type(p3d))
    <class 'py4j.java_gateway.JavaObject'>

    You can then call the associated methods, for example

    >>> p3d.getVolume()
    0.0

    Return the point coordinates

    >>> p3d = Point3D(1.0, 1.0, 0.0, False)
    >>> p3d.getCoordinatePython()
    [1.0, 1.0, 0.0]

    It will be a JavaList by default

    >>> coord = p3d.getCoordinatePython()
    >>> print(type(coord))
    <class 'py4j.java_collections.JavaList'>

    Make it a python list

    >>> coord_python = list(coord)
    >>> print(type(coord_python))
    <class 'list'>

    [Astro] Convert the (theta, phi) in Healpix pixel index:

    >>> p3d = Point3D(1.0, np.pi, 0.0, True) # (r, theta, phi)
    >>> p3d.toHealpix(2048, True)
    50331644

    To see all the available methods:

    >>> print(sorted(p3d.__dir__())) # doctest: +NORMALIZE_WHITESPACE
    ['center', 'distanceTo', 'equals', 'getClass', 'getCoordinate',
    'getCoordinatePython', 'getEnvelope', 'getHash', 'getVolume',
    'hasCenterCloseTo', 'hashCode', 'intersects', 'isEqual', 'isSpherical',
    'notify', 'notifyAll', 'toHealpix', 'toHealpix$default$2', 'toString',
    'wait', 'x', 'y', 'z']
    """
    scalapath = "com.astrolabsoftware.spark3d.geometryObjects.Point3D"
    p3d = load_from_jvm(scalapath)

    return p3d(x, y, z, isSpherical)
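
# Hedged sketch (illustration only): computing the separation between two
# points with `distanceTo`, one of the methods listed in the doctest above.
# The signature (a single Point3D argument) is an assumption based on the
# Scala class.
def _example_point_distance():
    """Distance between the origin and (1, 0, 0) in cartesian coordinates."""
    origin = Point3D(0.0, 0.0, 0.0, False)
    unit_x = Point3D(1.0, 0.0, 0.0, False)
    return origin.distanceTo(unit_x)  # expected to be 1.0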
def windowquery(df: DataFrame, windowtype: str, windowcoord: list, coordSys: str):
    """
    Perform a window query, that is a match between DataFrame elements and
    a user-defined window (point, sphere, shell, box).

    If windowtype =
        - point: windowcoord = [x, y, z]
        - sphere: windowcoord = [x, y, z, R]
        - shell: windowcoord = [x, y, z, Rin, Rout]
        - box: windowcoord = [x1, y1, z1, x2, y2, z2, x3, y3, z3]

    Use [x, y, z] for cartesian or [r, theta, phi] for spherical.
    Note that box only accepts cartesian coordinates.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame. Must have 3 columns corresponding to the
        coordinates (x, y, z) if cartesian or (r, theta, phi) if spherical.
    windowtype : str
        point, shell, sphere, or box.
    windowcoord : list of float
        List of Doubles. The coordinates of the window (see doc above).
    coordSys : str
        Coordinate system: spherical or cartesian.

    Returns
    --------
    out : DataFrame
        DataFrame with the coordinates of the objects found in the window.

    Examples
    --------
    >>> df = spark.read.format("csv")\
            .option("inferSchema", True)\
            .option("header", True)\
            .load("../src/test/resources/cartesian_spheres_manual.csv")

    Point-like window

    >>> windowtype = "point"
    >>> windowcoord = [1.0, 1.0, 1.0]
    >>> env = windowquery(df.select("x", "y", "z"), windowtype, windowcoord, "cartesian")
    >>> print(env.count())
    2

    You can add back the metadata

    >>> envWithMeta = df.join(env, ["x", "y", "z"], "left_semi")

    Sphere-like window

    >>> windowtype = "sphere"
    >>> windowcoord = [1.0, 1.0, 1.0, 2.0]
    >>> env = windowquery(df.select("x", "y", "z"), windowtype, windowcoord, "cartesian")
    >>> print(env.count())
    3

    Shell-like window

    >>> windowtype = "shell"
    >>> windowcoord = [1.0, 1.0, 1.0, 0.0, 2.0]
    >>> env = windowquery(df.select("x", "y", "z"), windowtype, windowcoord, "cartesian")
    >>> print(env.count())
    3

    Box-like window

    >>> windowtype = "box"
    >>> windowcoord = [2.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0]
    >>> env = windowquery(df.select("x", "y", "z"), windowtype, windowcoord, "cartesian")
    >>> print(env.count())
    2
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Queries.windowQuery".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    # To convert a Python list into a Scala List
    convpath = "{}.python.PythonClassTag.javaListtoscalaList".format(prefix)
    conv = load_from_jvm(convpath)

    out = _java2py(
        get_spark_context(),
        scalaclass(df._jdf, windowtype, conv(windowcoord), coordSys))

    return out
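
# Hedged sketch (illustration only): a shell window in spherical coordinates.
# The astro_obs.fits file and the (Z_COSMO, RA, DEC) column ordering are
# reused from the repartitioning examples below; the shell radii are
# arbitrary values chosen for the illustration.
def _example_spherical_shell_query(spark):
    """Select objects whose radial coordinate lies between 0.1 and 0.2."""
    df = spark.read.format("fits")\
        .option("hdu", 1)\
        .load("../src/test/resources/astro_obs.fits")
    windowcoord = [0.0, 0.0, 0.0, 0.1, 0.2]  # [r, theta, phi, Rin, Rout]
    env = windowquery(
        df.select("Z_COSMO", "RA", "DEC"), "shell", windowcoord, "spherical")
    # Recover the full rows (metadata included) with a left_semi join
    return df.join(env, ["Z_COSMO", "RA", "DEC"], "left_semi")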
def repartitionByCol(df: DataFrame, colname: str, preLabeled: bool, numPartitions: int = -1):
    """Repartition a DataFrame according to a column containing partition ID.

    Note this is not re-ordering elements, but making new partitions with
    objects having the same partition ID defined by one of the DataFrame
    columns (i.e. shuffling).

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    colname : str
        Column name describing the repartitioning. Typically Ints.
    preLabeled : bool
        True means the column containing the partition ID already contains
        numbers from 0 to `numPartitions - 1`. False otherwise. Note that
        in the latter case, the execution time will be longer as we need
        to map column values to partition IDs.
    numPartitions : int
        (optional) Number of partitions. If not provided, the code will
        guess the number of partitions by counting the number of distinct
        elements of the repartitioning column. As this can be costly, you
        can provide this information manually.

    Returns
    ---------
    dfout : DataFrame
        Repartitioned input DataFrame.

    Examples
    ---------
    Load data

    >>> df = spark.read.format("fits")\
            .option("hdu", 1)\
            .load("../src/test/resources/astro_obs.fits")

    Specify options

    >>> options = {
    ...     "geometry": "points",
    ...     "colnames": "Z_COSMO,RA,DEC",
    ...     "coordSys": "spherical",
    ...     "gridtype": "onion"}

    Add a column containing the partitioning (Onion)

    >>> df_colid = prePartition(df, options, 10)
    >>> print(df_colid.select("partition_id").distinct().count())
    10

    Trigger the repartitioning

    >>> df_repart = repartitionByCol(df_colid, "partition_id", True, 10)
    >>> def mapLen(part): yield len([*part])
    >>> df_repart.rdd.mapPartitions(mapLen).take(1)[0]
    2104
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Repartitioning.repartitionByCol".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    dfout = _java2py(
        get_spark_context(),
        scalaclass(df._jdf, colname, preLabeled, numPartitions))

    return dfout
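
# Hedged sketch (illustration only): repartitioning by a column that is NOT
# pre-labeled with integers in [0, numPartitions - 1]. The column name
# "healpix_id" is hypothetical; any discrete-valued column works, at the
# price of an extra pass to map its values to partition IDs (preLabeled=False).
def _example_repartition_unlabeled(df):
    """Repartition `df` so that rows sharing the same "healpix_id" end up together."""
    # numPartitions is left at -1: it is inferred by counting distinct values
    return repartitionByCol(df, "healpix_id", preLabeled=False)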
def prePartition(df: DataFrame, options: Dict = {"": ""}, numPartitions: int = -1):
    """Add a DataFrame column describing the partitioning.

    This method allows the use of a custom partitioner (SpatialPartitioner).
    Note that no data movement (shuffle) is performed here yet, as we just
    describe how the repartitioning should be done. Use `repartitionByCol`
    to trigger it.

    `options` must contain four entries:
        - gridtype: the type of repartitioning. Available: current
          (no repartitioning), onion, octree.
        - geometry: geometry of objects: points, spheres, or boxes.
        - coordSys: coordinate system: spherical or cartesian.
        - colnames: comma-separated names of the spatial coordinates.
          For points, must be "x,y,z" or "r,theta,phi". For spheres,
          must be "x,y,z,R" or "r,theta,phi,R".

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    options : Dictionary of Strings
        Dictionary containing metadata (see above).
    numPartitions : int
        (optional) The number of partitions wanted. -1 by default,
        i.e. the number of partitions of the input DataFrame.

    Returns
    ----------
    dfout : DataFrame
        Input DataFrame plus an additional column `partition_id`.

    Examples
    ----------
    Load data

    >>> df = spark.read.format("fits")\
            .option("hdu", 1)\
            .load("../src/test/resources/astro_obs.fits")

    Specify options

    >>> options = {
    ...     "geometry": "points",
    ...     "colnames": "Z_COSMO,RA,DEC",
    ...     "coordSys": "spherical",
    ...     "gridtype": "onion"}

    Add a column containing the partitioning (Onion)

    >>> df_colid = prePartition(df, options, 10)
    >>> print(df_colid.select("partition_id").distinct().count())
    10

    Note that you can also return the current partitioning:

    >>> options = {
    ...     "geometry": "points",
    ...     "colnames": "Z_COSMO,RA,DEC",
    ...     "coordSys": "spherical",
    ...     "gridtype": "current"}
    >>> df_colid = prePartition(df, options)
    >>> assert(df_colid.select("partition_id").distinct().count() == df.rdd.getNumPartitions())
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Repartitioning.prePartition".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    # To convert a Python dict into a Scala Map
    convpath = "{}.python.PythonClassTag.javaHashMaptoscalaMap".format(prefix)
    conv = load_from_jvm(convpath)

    dfout = _java2py(
        get_spark_context(),
        scalaclass(df._jdf, conv(options), numPartitions))

    return dfout
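
# Hedged sketch (illustration only): the full spatial repartitioning pipeline,
# chaining prePartition -> repartitionByCol -> checkLoadBalancing. It reuses
# the astro_obs.fits test data and the onion options from the doctests above;
# the number of partitions (8) is arbitrary. This assumes the three helpers
# are available together (in the installed package they may live in separate
# pyspark3d submodules).
def _example_spatial_pipeline(spark):
    """Spatially repartition a catalogue and inspect the resulting load balance."""
    df = spark.read.format("fits")\
        .option("hdu", 1)\
        .load("../src/test/resources/astro_obs.fits")
    options = {
        "geometry": "points",
        "colnames": "Z_COSMO,RA,DEC",
        "coordSys": "spherical",
        "gridtype": "onion"}
    # Describe the partitioning (adds a `partition_id` column, no shuffle yet)
    df_colid = prePartition(df, options, 8)
    # Trigger the shuffle according to `partition_id`
    df_repart = repartitionByCol(df_colid, "partition_id", True, 8)
    # Fraction of rows held by each of the 8 partitions
    return checkLoadBalancing(df_repart, kind="frac")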