def _jrdd(self):
    if self._jrdd_val:
        return self._jrdd_val
    func = self.func
    if not self._bypass_serializer and self.ctx.batchSize != 1:
        # wrap the user function so its output is grouped into batches
        # of the configured size before serialization
        oldfunc = self.func
        batchSize = self.ctx.batchSize

        def batched_func(split, iterator):
            return batched(oldfunc(split, iterator), batchSize)

        func = batched_func
    cmds = [func, self._bypass_serializer]
    pipe_command = ' '.join(b64enc(cloudpickle.dumps(f)) for f in cmds)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    class_manifest = self._prev_jrdd.classManifest()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    python_rdd = self.ctx._jvm.PythonRDD(
        self._prev_jrdd.rdd(), pipe_command, env, includes,
        self.preservesPartitioning, self.ctx.pythonExec,
        broadcast_vars, self.ctx._javaAccumulator, class_manifest)
    self._jrdd_val = python_rdd.asJavaRDD()
    return self._jrdd_val
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, (list, tuple)):
        obj = ListConverter().convert([_py2java(sc, x) for x in obj],
                                      sc._gateway._gateway_client)
    elif isinstance(obj, dict):
        result = {}
        for (key, value) in obj.items():
            result[key] = _py2java(sc, value)
        obj = MapConverter().convert(result, sc._gateway._gateway_client)
    elif isinstance(obj, JavaValue):
        obj = obj.value
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.loads(data)
    return obj
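# For reference, a minimal standalone sketch of the py4j conversion that
# _py2java builds on. It assumes a JVM GatewayServer is already listening on
# the default port; the literal values are illustrative only.
from py4j.java_gateway import JavaGateway
from py4j.java_collections import ListConverter, MapConverter

gateway = JavaGateway()  # connect to the running GatewayServer
jlist = ListConverter().convert([1, 2, 3], gateway._gateway_client)   # java.util.List
jmap = MapConverter().convert({"lr": 0.01}, gateway._gateway_client)  # java.util.Map
# jlist and jmap can now be passed as arguments to any JVM method.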
def jvertex_rdd(self):
    if self.jvrdd_val:
        return self.jvrdd_val
    if self.bypass_serializer:
        self.jvertex_rdd_deserializer = NoOpSerializer()
    enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true"
    profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None
    command = (self.func, profileStats, self.prev_jvertex_rdd_deserializer,
               self.jvertex_rdd_deserializer)
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    if len(pickled_command) > (1 << 20):  # 1M
        self.broadcast = self.ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(self.broadcast)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    java_storage_level = self.ctx._getJavaStorageLevel(StorageLevel.MEMORY_ONLY)
    python_rdd = self.ctx._jvm.PythonVertexRDD(
        self.prev_jvertex_rdd, bytearray(pickled_command), env, includes,
        self.preservesPartitioning, self.ctx.pythonExec,
        broadcast_vars, self.ctx._javaAccumulator, java_storage_level)
    self.jvrdd_val = python_rdd.asJavaVertexRDD()
    if enable_profile:
        self.id = self.jvrdd_val.id()
        self.ctx._add_profile(self.id, profileStats)
    return self.jvrdd_val
def saveAndOpenSelection(df, dsName: str, objectClassIdx: int, selectionName: str,
                         showObjects: bool = False, showTracks: bool = False,
                         openSelection: bool = False, objectClassIdxDisplay: int = -1,
                         interactiveObjectClassIdx: int = -1, port=25335,
                         python_proxy_port: int = 25334, address='127.0.0.1',
                         gateway_parameters={}):
    """Stores a selection to bacmman using python gateway (py4j).
    Bacmman must be running with an active python gateway server.

    Parameters
    ----------
    df : pandas DataFrame
        each line of the DataFrame is one element of the selection,
        defined by columns Indices & Position
    dsName : str
        bacmman dataset name to store the selection to
    objectClassIdx : int
        index of the object class of the elements of the selection
        in the bacmman dataset
    selectionName : str
        name of the selection
    showObjects : bool
        whether contours of objects should be shown
    showTracks : bool
        whether track links of objects should be shown
    openSelection : bool
        whether the first kymograph of the selection should be open
    objectClassIdxDisplay : int
        if openSelection is true, object class idx of the opened kymograph
    interactiveObjectClassIdx : int
        if openSelection is true, interactive object class idx
    python_proxy_port : int
        python port of the java gateway
    """
    gateway = JavaGateway(python_proxy_port=python_proxy_port,
                          gateway_parameters=GatewayParameters(address=address, port=port,
                                                               **gateway_parameters))
    try:
        idx = ListConverter().convert(df.Indices.tolist(), gateway._gateway_client)
        pos = ListConverter().convert(df.Position.tolist(), gateway._gateway_client)
        gateway.saveCurrentSelection(dsName, objectClassIdx, selectionName, idx, pos,
                                     showObjects, showTracks, openSelection, False,
                                     objectClassIdxDisplay, interactiveObjectClassIdx)
    except Py4JNetworkError:
        print("Could not connect, is BACMMAN started?")
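# Hedged usage sketch for saveAndOpenSelection: the column values, dataset name
# and selection name below are hypothetical, and BACMMAN must already be running
# with its python gateway server active.
import pandas as pd

selection_df = pd.DataFrame({"Indices": ["0-1-2", "0-1-3"],
                             "Position": ["Position0", "Position0"]})
saveAndOpenSelection(selection_df, dsName="my_dataset", objectClassIdx=1,
                     selectionName="bright_cells", showObjects=True)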
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, (list, tuple)):
        obj = ListConverter().convert([_py2java(sc, x) for x in obj],
                                      sc._gateway._gateway_client)
    elif isinstance(obj, dict):
        result = {}
        for (key, value) in obj.iteritems():
            result[key] = _py2java(sc, value) if isinstance(value, JavaValue) else value  # noqa
        obj = result
    elif isinstance(obj, JavaValue):
        obj = obj.value
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.loads(data)
    return obj
def _jdstream(self):
    if self._jdstream_val:
        return self._jdstream_val
    if self._bypass_serializer:
        self.jrdd_deserializer = NoOpSerializer()
    command = (self.func, self._prev_jrdd_deserializer, self._jrdd_deserializer)
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    if len(pickled_command) > (1 << 20):  # 1M
        broadcast = self.ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(broadcast)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    class_tag = self._prev_jdstream.classTag()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    python_dstream = self.ctx._jvm.PythonDStream(
        self._prev_jdstream.dstream(), bytearray(pickled_command), env,
        includes, self.preservesPartitioning, self.ctx.pythonExec,
        broadcast_vars, self.ctx._javaAccumulator, class_tag)
    self._jdstream_val = python_dstream.asJavaDStream()
    return self._jdstream_val
def setTerms(self, datum):
    '''
    :param datum: list of Double
    '''
    assert type(datum) == list
    javalist_datum = ListConverter().convert(datum, gateway._gateway_client)
    self.java_kbv.setTerms(javalist_datum)
def _api(self, *args):
    # `f` is captured from the enclosing decorator scope (see sketch below)
    jargs = ListConverter().convert(
        args, self.sql_ctx._sc._gateway._gateway_client)
    name = f.__name__
    jdf = getattr(self._jdf, name)(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jargs))
    return DataFrame(jdf, self.sql_ctx)
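# `_api` above closes over `f`: it is the body of a varargs-method decorator in
# the style of PySpark's `df_varargs_api`. A plausible enclosing wrapper (a
# sketch, not the verbatim original) would look like:
def df_varargs_api(f):
    def _api(self, *args):
        ...  # body as defined above
    _api.__name__ = f.__name__
    _api.__doc__ = f.__doc__
    return _api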
def union(self, rdds):
    """
    Build the union of a list of RDDs.

    This supports unions() of RDDs with different serialized formats,
    although this forces them to be reserialized using the default
    serializer:

    >>> path = os.path.join(tempdir, "union-text.txt")
    >>> with open(path, "w") as testFile:
    ...    testFile.write("Hello")
    >>> textFile = sc.textFile(path)
    >>> textFile.collect()
    [u'Hello']
    >>> parallelized = sc.parallelize(["World!"])
    >>> sorted(sc.union([textFile, parallelized]).collect())
    [u'Hello', 'World!']
    """
    first_jrdd_deserializer = rdds[0]._jrdd_deserializer
    if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds):
        rdds = [x._reserialize() for x in rdds]
    first = rdds[0]._jrdd
    rest = [x._jrdd for x in rdds[1:]]
    rest = ListConverter().convert(rest, self._gateway._gateway_client)
    return RDD(self._jsc.union(first, rest), self, rdds[0]._jrdd_deserializer)
def genTunnel(mapdata):
    '''Takes /map topic data and generates a tunnel in the correct format for
    the Sentel navigation engine. I see no easy way of modifying explorer as
    a global, as this must return void. The only other possibility would be
    to publish this to a new topic and path-plan in another subscription
    service. We will do this if we notice data is getting corrupted or the
    state of explorer is unclear.'''
    global explorer, ways, pubway, rtl_pub
    print('I have the nav_map')
    size = mapdata.info.width
    data = mapdata.data
    explorer.initializeArray(size)
    explorer.importHectorList(ListConverter().convert(data, gateway._gateway_client), 1)
    shortestdistance = 3
    maxrange = 40
    # integer division keeps grid coordinates integral
    p = explorer.findClosestFrontier(size // 2, size // 2, maxrange, shortestdistance)
    ways = Waypoints()
    x_ways, y_ways = [], []
    if p is None:
        rospy.loginfo("No path found! Go home!")
        rtl_pub.publish(True)
        return
    for pp in p:
        x_ways.append(int(pp.getX()))  # possible transpose, but map_viewer works currently
        y_ways.append(int(pp.getY()))
    ways.x = x_ways
    ways.y = y_ways
    pubway.publish(ways)  # publishing x,y gridpoints
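# Hedged wiring sketch: genTunnel reads .info.width and .data, which matches
# nav_msgs/OccupancyGrid, so it would typically be registered as a /map
# subscriber callback along these lines (node name is illustrative):
import rospy
from nav_msgs.msg import OccupancyGrid

rospy.init_node('tunnel_planner')
rospy.Subscriber('/map', OccupancyGrid, genTunnel)
rospy.spin()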
def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
    """
    Executes the given partitionFunc on the specified set of partitions,
    returning the result as an array of elements.

    If 'partitions' is not specified, this will run over all partitions.

    >>> myRDD = sc.parallelize(range(6), 3)
    >>> sc.runJob(myRDD, lambda part: [x * x for x in part])
    [0, 1, 4, 9, 16, 25]

    >>> myRDD = sc.parallelize(range(6), 3)
    >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
    [0, 1, 16, 25]
    """
    if partitions is None:
        partitions = range(rdd._jrdd.partitions().size())
    javaPartitions = ListConverter().convert(partitions,
                                             self._gateway._gateway_client)

    # Implementation note: This is implemented as a mapPartitions followed
    # by runJob() in order to avoid having to pass a Python lambda into
    # SparkContext#runJob.
    mappedRDD = rdd.mapPartitions(partitionFunc)
    it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd,
                                    javaPartitions, allowLocal)
    return list(mappedRDD._collect_iterator_through_file(it))
def agg(self, *exprs):
    """ Compute aggregates by specifying a map from column name to
    aggregate methods.

    The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`.

    :param exprs: list of aggregate columns, or a map from column name
                  to aggregate methods.

    >>> gdf = df.groupBy(df.name)
    >>> gdf.agg({"*": "count"}).collect()
    [Row(name=u'Bob', COUNT(1)=1), Row(name=u'Alice', COUNT(1)=1)]

    >>> from pyspark.sql import functions as F
    >>> gdf.agg(F.min(df.age)).collect()
    [Row(MIN(age#0)=5), Row(MIN(age#0)=2)]
    """
    assert exprs, "exprs should not be empty"
    if len(exprs) == 1 and isinstance(exprs[0], dict):
        jmap = MapConverter().convert(
            exprs[0], self.sql_ctx._sc._gateway._gateway_client)
        jdf = self._jdf.agg(jmap)
    else:
        # Columns
        assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
        jcols = ListConverter().convert(
            [c._jc for c in exprs[1:]], self.sql_ctx._sc._gateway._gateway_client)
        jdf = self._jdf.agg(exprs[0]._jc,
                            self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols))
    return DataFrame(jdf, self.sql_ctx)
def addToClusterer(stream, javaFile, ids):
    # get data into java
    jIds = ListConverter().convert(ids, gateway._gateway_client)
    util.thisLogger.logInfo(javaFile)
    gateway.entry_point.Moa_Clusterers_Outliers_Mcod_AddCsvDataToStream(
        stream, javaFile, jIds)
def getAttributeNames(flatActivations):
    # create the attribute names for the flat data
    attributeNames = [str(i) for i in range(0, len(flatActivations[0]))]
    # this is the first time we attempt to connect to the moa gateway,
    # so check the connection
    checkConnection()
    jAttributeNames = ListConverter().convert(attributeNames,
                                              gateway._gateway_client)
    return jAttributeNames
def union(self, rdds):
    """
    Build the union of a list of RDDs.
    """
    first = rdds[0]._jrdd
    rest = [x._jrdd for x in rdds[1:]]
    rest = ListConverter().convert(rest, self.gateway._gateway_client)
    return RDD(self._jsc.union(first, rest), self)
def fillna(self, value, subset=None):
    """Replace null values, alias for ``na.fill()``.

    :param value: int, long, float, string, or dict. Value to replace null
        values with. If the value is a dict, then `subset` is ignored and
        `value` must be a mapping from column name (string) to replacement
        value. The replacement value must be an int, long, float, or string.
    :param subset: optional list of column names to consider. Columns
        specified in subset that do not have matching data type are ignored.
        For example, if `value` is a string, and subset contains a
        non-string column, then the non-string column is simply ignored.

    >>> df4.fillna(50).show()
    age height name
    10  80     Alice
    5   50     Bob
    50  50     Tom
    50  50     null

    >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
    age height name
    10  80     Alice
    5   null   Bob
    50  null   Tom
    50  null   unknown

    >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
    age height name
    10  80     Alice
    5   null   Bob
    50  null   Tom
    50  null   unknown
    """
    if not isinstance(value, (float, int, long, basestring, dict)):
        raise ValueError("value should be a float, int, long, string, or dict")

    if isinstance(value, (int, long)):
        value = float(value)

    if isinstance(value, dict):
        value = MapConverter().convert(
            value, self.sql_ctx._sc._gateway._gateway_client)
        return DataFrame(self._jdf.na().fill(value), self.sql_ctx)
    elif subset is None:
        return DataFrame(self._jdf.na().fill(value), self.sql_ctx)
    else:
        if isinstance(subset, basestring):
            subset = [subset]
        elif not isinstance(subset, (list, tuple)):
            raise ValueError("subset should be a list or tuple of column names")

        cols = ListConverter().convert(
            subset, self.sql_ctx._sc._gateway._gateway_client)
        cols = self.sql_ctx._sc._jvm.PythonUtils.toSeq(cols)
        return DataFrame(self._jdf.na().fill(value, cols), self.sql_ctx)
def generate_xlog(traces_id, activities, groups, times):
    # Convert lists to Java compatible format
    traces_id_java = ListConverter().convert(traces_id, gateway._gateway_client)

    activities_java = []
    for activity in activities:
        activities_java.append(ListConverter().convert(activity,
                                                       gateway._gateway_client))
    activities_java = ListConverter().convert(activities_java,
                                              gateway._gateway_client)

    groups_java = []
    for group in groups:
        groups_java.append(ListConverter().convert(group, gateway._gateway_client))
    groups_java = ListConverter().convert(groups_java, gateway._gateway_client)

    times_java = []
    for time in times:
        times_java.append(ListConverter().convert(time, gateway._gateway_client))
    times_java = ListConverter().convert(times_java, gateway._gateway_client)

    verificator_app.generateXLog(traces_id_java, activities_java, groups_java,
                                 times_java)
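# Illustrative call to generate_xlog with toy data (shapes only: one inner list
# per trace; the values and the `verificator_app` entry point come from the
# surrounding module):
generate_xlog(traces_id=["t1", "t2"],
              activities=[["A", "B"], ["A", "C"]],
              groups=[["g1", "g1"], ["g1", "g2"]],
              times=[["00:00", "00:05"], ["00:00", "00:07"]])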
def getSupportedFiletypes(self):
    types = self.get_supported_filetypes_grouped()
    converted = {}
    for k, v in types.items():
        converted[k] = ListConverter().convert(v, Gateway.gateway._gateway_client)
    return MapConverter().convert(converted, Gateway.gateway._gateway_client)
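# For context, the Matplotlib call that getSupportedFiletypes bridges to Java;
# this much runs standalone (output abridged and backend-dependent):
from matplotlib.backends.backend_agg import FigureCanvasAgg
from matplotlib.figure import Figure

canvas = FigureCanvasAgg(Figure())
print(canvas.get_supported_filetypes_grouped())
# e.g. {'Portable Network Graphics': ['png'], 'Raw RGBA bitmap': ['raw', 'rgba'], ...}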
def getAllDevices(self, ids=None):
    devmap = None
    if ids is None:
        devmap = self.entry.getAllDevices()
    else:
        java_list = ListConverter().convert(ids, self.gateway._gateway_client)
        devmap = self.entry.getAllDevices(java_list)
    return devmap
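# Hypothetical usage of getAllDevices, assuming `client` is an instance of the
# surrounding wrapper class; the ids are illustrative:
all_devices = client.getAllDevices()                         # fetch everything
some_devices = client.getAllDevices(ids=["dev-1", "dev-2"])  # only the listed ids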
def getAxisLimits(self):
    limits = list()
    for ax in self.figure.axes:
        if ax.lines or ax.collections or ax.patches:  # skip "empty" axes
            limits += [*ax.get_xlim(), *ax.get_ylim()]
    java_list = ListConverter().convert(limits, Gateway.gateway._gateway_client)
    return java_list
def get_item_based_recommendations(self, paper_id_list):
    java_paper_id_list = ListConverter().convert(
        paper_id_list, self.gateway._gateway_client)
    recs = self.gateway.entry_point.recommend(java_paper_id_list)
    res = []
    for rec in recs:
        r = rec.split(',')
        res.append({'id': r[0], 'score': float(r[1])})
    return res
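# Hypothetical usage of get_item_based_recommendations; the paper ids are
# illustrative and `recommender` is an instance of the surrounding class:
recs = recommender.get_item_based_recommendations(["paper-123", "paper-456"])
for rec in recs:
    print(rec["id"], rec["score"])  # parsed from the "id,score" strings returned by the JVM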
def selectExpr(self, *expr):
    """
    Selects a set of SQL expressions. This is a variant of `select` that
    accepts SQL expressions.

    >>> df.selectExpr("age * 2", "abs(age)").collect()
    [Row((age * 2)=4, Abs(age)=2), Row((age * 2)=10, Abs(age)=5)]
    """
    jexpr = ListConverter().convert(expr, self._sc._gateway._gateway_client)
    jdf = self._jdf.selectExpr(self._sc._jvm.PythonUtils.toSeq(jexpr))
    return DataFrame(jdf, self.sql_ctx)
def tag_token(self, token: str) -> [str]:
    """
    Tag a single token using the tagger of a language.
    All valid tags for the token are returned.
    """
    tokens = ListConverter().convert([token], self.gateway._gateway_client)
    tags = list(set(
        LanguageTool._get_tags_of_tagged_tokens(self.tagger.tag(tokens)[0])))
    tags.sort()  # hm, "Zusammenhänge" gets tags more than once
    return tags
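# Illustrative usage of tag_token, assuming `lt` wraps a German LanguageTool
# instance; the exact tags returned depend on the language's tagger:
tags = lt.tag_token("Zusammenhänge")
print(tags)  # sorted, de-duplicated POS tags for the single token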
def _to_seq(sc, cols, converter=None):
    """
    Convert a list of Column (or names) into a JVM Seq of Column.

    An optional `converter` could be used to convert items in `cols`
    into JVM Column objects.
    """
    if converter:
        cols = [converter(c) for c in cols]
    jcols = ListConverter().convert(cols, sc._gateway._gateway_client)
    return sc._jvm.PythonUtils.toSeq(jcols)
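# Sketch of how _to_seq is typically used, assuming an active SparkContext `sc`
# and the `_to_java_column` helper seen in the other snippets:
jseq = _to_seq(sc, ["age", "name"], converter=_to_java_column)
# jseq is a JVM Seq[Column], ready to pass to DataFrame methods such as select.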
def getVectorOps(self):
    from omnetpp.scave import vectorops
    ops = []
    # ctor is: VectorOp(String module, String name, String signature,
    #                   String docstring, String label, String example)
    for o in vectorops._report_ops():
        try:
            ops.append(Gateway.gateway.jvm.org.omnetpp.scave.editors.VectorOperations.VectorOp(*o))
        except Exception as E:
            print("Exception while processing vector operation:", o[4])
    return ListConverter().convert(ops, Gateway.gateway._gateway_client)
def __init__(self,
             factories: Union[Callable[..., None], Iterable[Callable[..., None]]] = None,
             count: int = None,
             name: str = None,
             data_generator: Iterator[Dict[str, Any]] = None,
             initial_state: Dict[str, vertex_constructor_param_types] = None):
    builder = k.jvm_view().SequenceBuilder().named(name)

    if initial_state is not None:
        initial_state_java = k.to_java_map(
            {_VertexLabel(k): cast_to_double_vertex(v).unwrap()
             for (k, v) in initial_state.items()})
        vertex_dictionary = k.jvm_view().SimpleVertexDictionary.backedBy(initial_state_java)
        builder = builder.withInitialState(vertex_dictionary)

    if count is None and data_generator is None:
        raise ValueError(
            "Cannot create a sequence of an unknown size: "
            "you must specify either a count or a data_generator")
    elif count is not None and data_generator is not None:
        raise ValueError("If you pass in a data_generator you cannot also pass in a count")
    elif factories is None:
        raise ValueError("You must provide a value for the 'factories' input")

    if not isinstance(factories, CollectionsIterable):
        factories = [factories]

    if count is not None:
        functions = [Consumer(partial(lambda f, p: f(SequenceItem(p)), f))
                     for f in factories]
        java_functions = ListConverter().convert(functions, k._gateway._gateway_client)
        builder = builder.count(count).withFactories(java_functions)

    if data_generator is not None:
        bifunctions = [BiConsumer(partial(lambda f, p, data: f(SequenceItem(p), data), f))
                       for f in factories]
        java_bifunctions = ListConverter().convert(bifunctions, k._gateway._gateway_client)
        data_generator_java = (k.to_java_map(m) for m in data_generator)
        builder = builder.fromIterator(JavaIterator(data_generator_java)).withFactories(java_bifunctions)

    sequence = builder.build()
    super().__init__(sequence)
def registerFunction(self, name, f, returnType="string"):
    def func(split, iterator):
        return imap(f, iterator)
    command = (func, self._sc.serializer, self._sc.serializer)
    env = MapConverter().convert(self._sc.environment,
                                 self._sc._gateway._gateway_client)
    includes = ListConverter().convert(self._sc._python_includes,
                                       self._sc._gateway._gateway_client)
    self._ssql_ctx.registerPython(
        name, bytearray(CloudPickleSerializer().dumps(command)),
        env, includes, self._sc.pythonExec, self._sc._javaAccumulator,
        returnType)
def __init__(self, name=None, terms=None):
    '''
    :param name: the name
    :param terms: list of Double
    '''
    if name is None and terms is None:
        self.java_kbv = gateway.entry_point.getJFMLKnowledgebaseVariable_Factory().createAnYaDataCloudType()
    elif name is not None and terms is None:
        assert type(name) == str
        self.java_kbv = gateway.entry_point.getJFMLKnowledgebaseVariable_Factory().createAnYaDataCloudType(name)
    elif name is not None and terms is not None:
        assert type(name) == str and type(terms) == list
        javalist_terms = ListConverter().convert(terms, gateway._gateway_client)
        self.java_kbv = gateway.entry_point.getJFMLKnowledgebaseVariable_Factory().createAnYaDataCloudType(name, javalist_terms)
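# Hypothetical usage of the constructor above together with the setTerms method
# shown earlier; the class name AnYaDataCloudType is assumed, and the py4j
# gateway must be live:
cloud = AnYaDataCloudType(name="cloud1", terms=[0.1, 0.5, 0.9])
cloud.setTerms([0.2, 0.4, 0.8])  # replace the terms via the Java-side setter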
def select(self, *cols):
    """ Selects a set of expressions.

    >>> df.select('*').collect()
    [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
    >>> df.select('name', 'age').collect()
    [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
    >>> df.select(df.name, (df.age + 10).alias('age')).collect()
    [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
    """
    jcols = ListConverter().convert([_to_java_column(c) for c in cols],
                                    self._sc._gateway._gateway_client)
    jdf = self._jdf.select(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols))
    return DataFrame(jdf, self.sql_ctx)
def _jrdd(self):
    if self._jrdd_val:
        return self._jrdd_val
    if self._bypass_serializer:
        serializer = NoOpSerializer()
    else:
        serializer = self.ctx.serializer
    command = (self.func, self._prev_jrdd_deserializer, serializer)
    pickled_command = CloudPickleSerializer().dumps(command)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    class_tag = self._prev_jrdd.classTag()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(),
                                         bytearray(pickled_command),
                                         env, includes, self.preservesPartitioning,
                                         self.ctx.pythonExec, broadcast_vars,
                                         self.ctx._javaAccumulator, class_tag)
    self._jrdd_val = python_rdd.asJavaRDD()
    return self._jrdd_val
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jcols = ListConverter().convert([_to_java_column(c) for c in cols],
                                    sc._gateway._gateway_client)
    jc = sc._jvm.functions.countDistinct(_to_java_column(col),
                                         sc._jvm.PythonUtils.toSeq(jcols))
    return Column(jc)