Code Example #1
    def _jrdd(self):
        if self._jrdd_val:
            return self._jrdd_val
        func = self.func
        if not self._bypass_serializer and self.ctx.batchSize != 1:
            oldfunc = self.func
            batchSize = self.ctx.batchSize

            def batched_func(split, iterator):
                return batched(oldfunc(split, iterator), batchSize)

            func = batched_func
        cmds = [func, self._bypass_serializer]
        pipe_command = ' '.join(b64enc(cloudpickle.dumps(f)) for f in cmds)
        broadcast_vars = ListConverter().convert(
            [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
            self.ctx._gateway._gateway_client)
        self.ctx._pickled_broadcast_vars.clear()
        class_manifest = self._prev_jrdd.classManifest()
        env = MapConverter().convert(self.ctx.environment,
                                     self.ctx._gateway._gateway_client)
        includes = ListConverter().convert(self.ctx._python_includes,
                                           self.ctx._gateway._gateway_client)
        python_rdd = self.ctx._jvm.PythonRDD(
            self._prev_jrdd.rdd(), pipe_command, env, includes,
            self.preservesPartitioning, self.ctx.pythonExec, broadcast_vars,
            self.ctx._javaAccumulator, class_manifest)
        self._jrdd_val = python_rdd.asJavaRDD()
        return self._jrdd_val
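Every snippet on this page shares the same py4j idiom: instantiate ListConverter (or MapConverter) and call convert(python_collection, gateway._gateway_client) to obtain a java.util.List or java.util.Map that the JVM side can consume. A minimal, self-contained sketch of that idiom, assuming a JVM-side GatewayServer is already listening on the default port (the entry_point call at the end is hypothetical):

# Minimal sketch of the conversion idiom shared by the examples on this page.
# Assumes a JVM-side GatewayServer is already running; the entry_point.process
# call is hypothetical.
from py4j.java_gateway import JavaGateway
from py4j.java_collections import ListConverter, MapConverter

gateway = JavaGateway()

# Python list -> java.util.List
java_list = ListConverter().convert([1, 2, 3], gateway._gateway_client)

# Python dict -> java.util.Map
java_map = MapConverter().convert({"name": "value"}, gateway._gateway_client)

# The converted collections can be passed to any JVM method, e.g.:
# gateway.entry_point.process(java_list, java_map)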
Code Example #2
File: common.py Project: zgj123123/BigDL
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, (list, tuple)):
        obj = ListConverter().convert([_py2java(sc, x) for x in obj],
                                      sc._gateway._gateway_client)
    elif isinstance(obj, dict):
        result = {}
        print(obj.keys())
        for (key, value) in obj.items():
            result[key] = _py2java(sc, value)
        obj = MapConverter().convert(result, sc._gateway._gateway_client)
    elif isinstance(obj, JavaValue):
        obj = obj.value
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.loads(data)
    return obj
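The function recurses through lists, tuples and dicts, so nested Python containers come back as nested Java collections; anything it does not recognize is pickled and reconstructed on the JVM via BigDLSerDe. A hedged usage sketch, assuming an active SparkContext named sc:

# Hedged usage sketch for _py2java; `sc` is assumed to be an active
# SparkContext backing the py4j gateway, and the parameter values are made up.
params = {"layers": [128, 64, 10], "dropout": 0.5}
jparams = _py2java(sc, params)  # java.util.Map whose "layers" value is a java.util.List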
Code Example #3
    def jvertex_rdd(self):
        if self.jvrdd_val:
            return self.jvrdd_val
        if self.bypass_serializer:
            self.jvertex_rdd_deserializer = NoOpSerializer()
        # re-enable optional profiling so enable_profile/profileStats below are defined
        enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true"
        profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None
        command = (self.func, profileStats, self.prev_jvertex_rdd_deserializer,
                   self.jvertex_rdd_deserializer)
        # the serialized command will be compressed by broadcast
        ser = CloudPickleSerializer()
        pickled_command = ser.dumps(command)
        if len(pickled_command) > (1 << 20):  # 1M
            self.broadcast = self.ctx.broadcast(pickled_command)
            pickled_command = ser.dumps(self.broadcast)
        broadcast_vars = ListConverter().convert(
            [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
            self.ctx._gateway._gateway_client)
        self.ctx._pickled_broadcast_vars.clear()
        env = MapConverter().convert(self.ctx.environment,
                                     self.ctx._gateway._gateway_client)
        includes = ListConverter().convert(self.ctx._python_includes,
                                           self.ctx._gateway._gateway_client)
        java_storage_level = self.ctx._getJavaStorageLevel(
            StorageLevel.MEMORY_ONLY)
        python_rdd = self.ctx._jvm.PythonVertexRDD(
            self.prev_jvertex_rdd, bytearray(pickled_command), env, includes,
            self.preservesPartitioning, self.ctx.pythonExec, broadcast_vars,
            self.ctx._javaAccumulator, java_storage_level)
        self.jvrdd_val = python_rdd.asJavaVertexRDD()

        if enable_profile:
            self.id = self.jvrdd_val.id()
            self.ctx._add_profile(self.id, profileStats)
        return self.jvrdd_val
Code Example #4
def saveAndOpenSelection(df, dsName:str, objectClassIdx:int, selectionName:str, showObjects:bool=False, showTracks:bool=False, openSelection:bool=False, objectClassIdxDisplay:int=-1, interactiveObjectClassIdx:int=-1, port=25335, python_proxy_port:int=25334, address='127.0.0.1', gateway_parameters={}):
    """Stores a selection to bacmman using python gateway (py4j). Bacmman must be running with an active python gateway server.

    Parameters
    ----------
    df : pandas DataFrame
        each line of the DataFrame is one element of the selection, defined by columns Indices & Position
    dsName : str
        bacmman dataset name to store the selection to.
    objectClassIdx : int
        index of the object class of the elements of the selection in the bacmman dataset
    selectionName : str
        name of the selection
    showObjects : bool
        whether contours of objects should be shown
    showTracks : bool
        whether track links of objects should be shown
    openSelection : bool
        whether the first kymograph of the selection should be open
    objectClassIdxDisplay : int
        if openSelection is true, object class idx of the opened kymograph
    interactiveObjectClassIdx : int
        if openSelection is true, interactive object class idx
    python_proxy_port : int
        python port of the java gateway
    """
    gateway = JavaGateway(python_proxy_port=python_proxy_port, gateway_parameters=GatewayParameters(address=address, port=port, **gateway_parameters))
    try:
        idx = ListConverter().convert(df.Indices.tolist(), gateway._gateway_client)
        pos = ListConverter().convert(df.Position.tolist(), gateway._gateway_client)
        gateway.saveCurrentSelection(dsName, objectClassIdx, selectionName, idx, pos, showObjects, showTracks, openSelection, False, objectClassIdxDisplay, interactiveObjectClassIdx)
    except Py4JNetworkError:
        print("Could not connect, is BACMMAN started?")
Code Example #5
File: common.py Project: Kim-Seongjung/BigDL
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, (list, tuple)):
        obj = ListConverter().convert([_py2java(sc, x) for x in obj],
                                      sc._gateway._gateway_client)
    elif isinstance(obj, dict):
        result = {}
        for (key, value) in obj.iteritems():
            result[key] = _py2java(sc, value) if isinstance(value, JavaValue) else value  # noqa
        obj = result

    elif isinstance(obj, JavaValue):
        obj = obj.value
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.loads(data)
    return obj
Code Example #6
 def _jdstream(self):
     if self._jdstream_val:
         return self._jdstream_val
     if self._bypass_serializer:
         self._jrdd_deserializer = NoOpSerializer()
     command = (self.func, self._prev_jrdd_deserializer,
                self._jrdd_deserializer)
     # the serialized command will be compressed by broadcast
     ser = CloudPickleSerializer()
     pickled_command = ser.dumps(command)
     if len(pickled_command) > (1 << 20):  # 1M
         broadcast = self.ctx.broadcast(pickled_command)
         pickled_command = ser.dumps(broadcast)
     broadcast_vars = ListConverter().convert(
         [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
         self.ctx._gateway._gateway_client)
     self.ctx._pickled_broadcast_vars.clear()
     class_tag = self._prev_jdstream.classTag()
     env = MapConverter().convert(self.ctx.environment,
                                  self.ctx._gateway._gateway_client)
     includes = ListConverter().convert(self.ctx._python_includes,
                                        self.ctx._gateway._gateway_client)
     python_dstream = self.ctx._jvm.PythonDStream(
         self._prev_jdstream.dstream(), bytearray(pickled_command), env,
         includes, self.preservesPartitioning, self.ctx.pythonExec,
         broadcast_vars, self.ctx._javaAccumulator, class_tag)
     self._jdstream_val = python_dstream.asJavaDStream()
     return self._jdstream_val
Code Example #7
File: AnYaDataCloudType.py Project: mrektor/py4jfml
 def setTerms(self, datum):
     '''
     :param datum: list of Double
     '''
     assert type(datum)==list
     javalist_datum = ListConverter().convert(datum, gateway._gateway_client)
     self.java_kbv.setTerms(javalist_datum)
Code Example #8
 def _api(self, *args):
     jargs = ListConverter().convert(
         args, self.sql_ctx._sc._gateway._gateway_client)
     name = f.__name__
     jdf = getattr(self._jdf,
                   name)(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jargs))
     return DataFrame(jdf, self.sql_ctx)
Code Example #9
File: context.py Project: mkolod/incubator-spark
    def union(self, rdds):
        """
        Build the union of a list of RDDs.

        This supports unions() of RDDs with different serialized formats,
        although this forces them to be reserialized using the default
        serializer:

        >>> path = os.path.join(tempdir, "union-text.txt")
        >>> with open(path, "w") as testFile:
        ...    testFile.write("Hello")
        >>> textFile = sc.textFile(path)
        >>> textFile.collect()
        [u'Hello']
        >>> parallelized = sc.parallelize(["World!"])
        >>> sorted(sc.union([textFile, parallelized]).collect())
        [u'Hello', 'World!']
        """
        first_jrdd_deserializer = rdds[0]._jrdd_deserializer
        if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds):
            rdds = [x._reserialize() for x in rdds]
        first = rdds[0]._jrdd
        rest = [x._jrdd for x in rdds[1:]]
        rest = ListConverter().convert(rest, self._gateway._gateway_client)
        return RDD(self._jsc.union(first, rest), self,
                   rdds[0]._jrdd_deserializer)
Code Example #10
def genTunnel(mapdata):
	'''Takes /map topic data and generates a tunnel in the correct
	format for the Sentel navigation engine. I see no easy way around
	modifying explorer as a global, since this callback must return nothing.
	The only other possibility would be to publish this to a new topic
	and path plan in another subscription service. We will do that
	if we notice data is getting corrupted or the state of explorer
	is unclear.'''
	global explorer,ways,pubway,rtl_pub
	print('I have the nav_map')
	size = mapdata.info.width
	data = mapdata.data
	explorer.initializeArray(size)
	explorer.importHectorList(ListConverter().convert(data,gateway._gateway_client),1)
	shortestdistance = 3
	maxrange = 40

	p = explorer.findClosestFrontier(size/2,size/2,maxrange,shortestdistance)
	ways = Waypoints()
	x_ways, y_ways = [], []
	if p is None:
		rospy.loginfo("No path found! Go home!.")
		rtl_pub.publish(True)
		return
	for pp in p:
		x_ways.append(int(pp.getX()))  # Possible transpose, but map_viewer works currently
		y_ways.append(int(pp.getY()))
	ways.x = x_ways
	ways.y = y_ways
	pubway.publish(ways) #publishing x,y gridpoints
Code Example #11
File: context.py Project: a770606860/database
    def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
        """
        Executes the given partitionFunc on the specified set of partitions,
        returning the result as an array of elements.

        If 'partitions' is not specified, this will run over all partitions.

        >>> myRDD = sc.parallelize(range(6), 3)
        >>> sc.runJob(myRDD, lambda part: [x * x for x in part])
        [0, 1, 4, 9, 16, 25]

        >>> myRDD = sc.parallelize(range(6), 3)
        >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)
        [0, 1, 16, 25]
        """
        if partitions is None:
            partitions = range(rdd._jrdd.partitions().size())
        javaPartitions = ListConverter().convert(partitions,
                                                 self._gateway._gateway_client)

        # Implementation note: This is implemented as a mapPartitions followed
        # by runJob() in order to avoid having to pass a Python lambda into
        # SparkContext#runJob.
        mappedRDD = rdd.mapPartitions(partitionFunc)
        it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd,
                                        javaPartitions, allowLocal)
        return list(mappedRDD._collect_iterator_through_file(it))
Code Example #12
File: dataframe.py Project: lorenzfischer/spark
    def agg(self, *exprs):
        """ Compute aggregates by specifying a map from column name
        to aggregate methods.

        The available aggregate methods are `avg`, `max`, `min`,
        `sum`, `count`.

        :param exprs: list of aggregate columns or a map from column
                      name to aggregate methods.

        >>> gdf = df.groupBy(df.name)
        >>> gdf.agg({"*": "count"}).collect()
        [Row(name=u'Bob', COUNT(1)=1), Row(name=u'Alice', COUNT(1)=1)]

        >>> from pyspark.sql import functions as F
        >>> gdf.agg(F.min(df.age)).collect()
        [Row(MIN(age#0)=5), Row(MIN(age#0)=2)]
        """
        assert exprs, "exprs should not be empty"
        if len(exprs) == 1 and isinstance(exprs[0], dict):
            jmap = MapConverter().convert(
                exprs[0], self.sql_ctx._sc._gateway._gateway_client)
            jdf = self._jdf.agg(jmap)
        else:
            # Columns
            assert all(isinstance(c, Column)
                       for c in exprs), "all exprs should be Column"
            jcols = ListConverter().convert(
                [c._jc for c in exprs[1:]],
                self.sql_ctx._sc._gateway._gateway_client)
            jdf = self._jdf.agg(exprs[0]._jc,
                                self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols))
        return DataFrame(jdf, self.sql_ctx)
Code Example #13
File: analyse.py Project: chambai/deepstreamce
def addToClusterer(stream, javaFile, ids):
    # get data into java
    jIds = ListConverter().convert(ids, gateway._gateway_client)

    util.thisLogger.logInfo(javaFile)
    gateway.entry_point.Moa_Clusterers_Outliers_Mcod_AddCsvDataToStream(
        stream, javaFile, jIds)
Code Example #14
File: analyse.py Project: chambai/deepstreamce
def getAttributeNames(flatActivations):
    # create the attribute names for the flat data
    attributeNames = [str(i) for i in range(0, len(flatActivations[0]))]
    # this is the first time we attempt to connect to the moa gateway, check the connection
    checkConnection()
    jAttributeNames = ListConverter().convert(attributeNames,
                                              gateway._gateway_client)
    return jAttributeNames
Code Example #15
File: context.py Project: fightingBilling/spark-1
 def union(self, rdds):
     """
     Build the union of a list of RDDs.
     """
     first = rdds[0]._jrdd
     rest = [x._jrdd for x in rdds[1:]]
     rest = ListConverter().convert(rest, self.gateway._gateway_client)
     return RDD(self._jsc.union(first, rest), self)
Code Example #16
    def fillna(self, value, subset=None):
        """Replace null values, alias for ``na.fill()``.

        :param value: int, long, float, string, or dict.
            Value to replace null values with.
            If the value is a dict, then `subset` is ignored and `value` must be a mapping
            from column name (string) to replacement value. The replacement value must be
            an int, long, float, or string.
        :param subset: optional list of column names to consider.
            Columns specified in subset that do not have matching data type are ignored.
            For example, if `value` is a string, and subset contains a non-string column,
            then the non-string column is simply ignored.

        >>> df4.fillna(50).show()
        age height name
        10  80     Alice
        5   50     Bob
        50  50     Tom
        50  50     null

        >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
        age height name
        10  80     Alice
        5   null   Bob
        50  null   Tom
        50  null   unknown

        >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
        age height name
        10  80     Alice
        5   null   Bob
        50  null   Tom
        50  null   unknown
        """
        if not isinstance(value, (float, int, long, basestring, dict)):
            raise ValueError(
                "value should be a float, int, long, string, or dict")

        if isinstance(value, (int, long)):
            value = float(value)

        if isinstance(value, dict):
            value = MapConverter().convert(
                value, self.sql_ctx._sc._gateway._gateway_client)
            return DataFrame(self._jdf.na().fill(value), self.sql_ctx)
        elif subset is None:
            return DataFrame(self._jdf.na().fill(value), self.sql_ctx)
        else:
            if isinstance(subset, basestring):
                subset = [subset]
            elif not isinstance(subset, (list, tuple)):
                raise ValueError(
                    "subset should be a list or tuple of column names")

            cols = ListConverter().convert(
                subset, self.sql_ctx._sc._gateway._gateway_client)
            cols = self.sql_ctx._sc._jvm.PythonUtils.toSeq(cols)
            return DataFrame(self._jdf.na().fill(value, cols), self.sql_ctx)
Code Example #17
def generate_xlog(traces_id, activities, groups, times):
    # Convert lists to Java compatible format
    traces_id_java = ListConverter().convert(traces_id,
                                             gateway._gateway_client)

    activities_java = []
    for activity in activities:
        activities_java.append(ListConverter().convert(
            activity, gateway._gateway_client))
    activities_java = ListConverter().convert(activities_java,
                                              gateway._gateway_client)

    groups_java = []
    for group in groups:
        groups_java.append(ListConverter().convert(group,
                                                   gateway._gateway_client))
    groups_java = ListConverter().convert(groups_java, gateway._gateway_client)

    times_java = []
    for time in times:
        times_java.append(ListConverter().convert(time,
                                                  gateway._gateway_client))
    times_java = ListConverter().convert(times_java, gateway._gateway_client)

    verificator_app.generateXLog(traces_id_java, activities_java, groups_java,
                                 times_java)
Code Example #18
    def getSupportedFiletypes(self):
        types = self.get_supported_filetypes_grouped()
        converted = {}
        for k, v in types.items():
            converted[k] = ListConverter().convert(
                v, Gateway.gateway._gateway_client)

        return MapConverter().convert(converted,
                                      Gateway.gateway._gateway_client)
Code Example #19
 def getAllDevices(self, ids=None):
     devmap = None
     if ids is None:
         devmap = self.entry.getAllDevices()
     else:
         java_list = ListConverter().convert(ids,
                                             self.gateway._gateway_client)
         devmap = self.entry.getAllDevices(java_list)
     return devmap
Code Example #20
    def getAxisLimits(self):
        limits = list()

        for ax in self.figure.axes:
            if ax.lines or ax.collections or ax.patches:  # skip "empty" axes
                limits += [*ax.get_xlim(), *ax.get_ylim()]

        java_list = ListConverter().convert(limits,
                                            Gateway.gateway._gateway_client)
        return java_list
Code Example #21
    def get_item_based_recommendations(self, paper_id_list):
        java_paper_id_list = ListConverter().convert(
            paper_id_list, self.gateway._gateway_client)
        recs = self.gateway.entry_point.recommend(java_paper_id_list)
        res = []
        for rec in recs:
            r = rec.split(',')
            res.append({'id': r[0], 'score': float(r[1])})

        return res
Code Example #22
    def selectExpr(self, *expr):
        """
        Selects a set of SQL expressions. This is a variant of
        `select` that accepts SQL expressions.

        >>> df.selectExpr("age * 2", "abs(age)").collect()
        [Row((age * 2)=4, Abs(age)=2), Row((age * 2)=10, Abs(age)=5)]
        """
        jexpr = ListConverter().convert(expr, self._sc._gateway._gateway_client)
        jdf = self._jdf.selectExpr(self._sc._jvm.PythonUtils.toSeq(jexpr))
        return DataFrame(jdf, self.sql_ctx)
Code Example #23
 def tag_token(self, token: str) -> [str]:
     """
     Tag a single token using the tagger of a language. All valid tags for the token are returned.
     """
     tokens = ListConverter().convert([token], self.gateway._gateway_client)
     tags = list(
         set(
             LanguageTool._get_tags_of_tagged_tokens(
                 self.tagger.tag(tokens)[0])))
     tags.sort()  # hm, "Zusammenhänge" gets tags more than once
     return tags
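A hedged usage sketch for tag_token; lt stands for an instance of the surrounding wrapper class with an initialized tagger, and the example word is arbitrary:

# Hedged usage sketch: `lt` is assumed to be an instance of the wrapper class.
tags = lt.tag_token("Häuser")
print(tags)  # sorted list of every tag the language's tagger assigns to the token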
Code Example #24
File: dataframe.py Project: szheng79/spark
def _to_seq(sc, cols, converter=None):
    """
    Convert a list of Column (or names) into a JVM Seq of Column.

    An optional `converter` could be used to convert items in `cols`
    into JVM Column objects.
    """
    if converter:
        cols = [converter(c) for c in cols]
    jcols = ListConverter().convert(cols, sc._gateway._gateway_client)
    return sc._jvm.PythonUtils.toSeq(jcols)
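A hedged usage sketch for _to_seq, assuming an active SparkContext named sc; the converter argument is only needed when the items are not already JVM objects:

# Hedged usage sketch: build a JVM Seq from plain column-name strings.
col_names = ["age", "name"]
jseq = _to_seq(sc, col_names)  # JVM object wrapping a scala.collection.Seq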
Code Example #25
    def getVectorOps(self):
        from omnetpp.scave import vectorops

        ops = []
        # ctor is: VectorOp(String module, String name, String signature, String docstring, String label, String example)
        for o in vectorops._report_ops():
            try:
                ops.append(Gateway.gateway.jvm.org.omnetpp.scave.editors.VectorOperations.VectorOp(*o))
            except Exception as E:
                print("Exception while processing vector operation:", o[4])

        return ListConverter().convert(ops, Gateway.gateway._gateway_client)
Code Example #26
File: sequence.py Project: shazbots/keanu
    def __init__(self,
                 factories: Union[Callable[..., None], Iterable[Callable[..., None]]] = None,
                 count: int = None,
                 name: str = None,
                 data_generator: Iterator[Dict[str, Any]] = None,
                 initial_state: Dict[str, vertex_constructor_param_types] = None):

        builder = k.jvm_view().SequenceBuilder().named(name)

        if initial_state is not None:
            initial_state_java = k.to_java_map(
                {_VertexLabel(k): cast_to_double_vertex(v).unwrap() for (k, v) in initial_state.items()})
            vertex_dictionary = k.jvm_view().SimpleVertexDictionary.backedBy(initial_state_java)
            builder = builder.withInitialState(vertex_dictionary)

        if count is None and data_generator is None:
            raise ValueError(
                "Cannot create a sequence of an unknown size: you must specify either a count of a data_generator")
        elif count is not None and data_generator is not None:
            raise ValueError("If you pass in a data_generator you cannot also pass in a count")
        elif factories is None:
            raise ValueError("You must provide a value for the 'factories' input")

        if not isinstance(factories, CollectionsIterable):
            factories = [factories]

        if count is not None:
            functions = [Consumer(partial(lambda f, p: f(SequenceItem(p)), f)) for f in factories]
            java_functions = ListConverter().convert(functions, k._gateway._gateway_client)
            builder = builder.count(count).withFactories(java_functions)

        if data_generator is not None:
            bifunctions = [BiConsumer(partial(lambda f, p, data: f(SequenceItem(p), data), f)) for f in factories]
            java_bifunctions = ListConverter().convert(bifunctions, k._gateway._gateway_client)
            data_generator_java = (k.to_java_map(m) for m in data_generator)
            builder = builder.fromIterator(JavaIterator(data_generator_java)).withFactories(java_bifunctions)

        sequence = builder.build()
        super().__init__(sequence)
Code Example #27
    def registerFunction(self, name, f, returnType="string"):
        def func(split, iterator):
            return imap(f, iterator)

        command = (func, self._sc.serializer, self._sc.serializer)
        env = MapConverter().convert(self._sc.environment,
                                     self._sc._gateway._gateway_client)
        includes = ListConverter().convert(self._sc._python_includes,
                                           self._sc._gateway._gateway_client)
        self._ssql_ctx.registerPython(
            name, bytearray(CloudPickleSerializer().dumps(command)), env,
            includes, self._sc.pythonExec, self._sc._javaAccumulator,
            returnType)
Code Example #28
File: AnYaDataCloudType.py Project: mrektor/py4jfml
 def __init__(self, name=None, terms=None):
     '''
     :param name: the name
     :param terms: list of Double
     '''
     if name==None and terms==None:
         self.java_kbv = gateway.entry_point.getJFMLKnowledgebaseVariable_Factory().createAnYaDataCloudType()
     elif name!=None and terms==None:
         assert type(name)==str
         self.java_kbv = gateway.entry_point.getJFMLKnowledgebaseVariable_Factory().createAnYaDataCloudType(name)
     elif name!=None and terms!=None:
         assert type(name)==str and type(terms)==list
         javalist_terms = ListConverter().convert(terms, gateway._gateway_client)
         self.java_kbv = gateway.entry_point.getJFMLKnowledgebaseVariable_Factory().createAnYaDataCloudType(name, javalist_terms)
Code Example #29
File: dataframe.py Project: sisihj/spark
    def select(self, *cols):
        """ Selecting a set of expressions.

        >>> df.select('*').collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        >>> df.select('name', 'age').collect()
        [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
        >>> df.select(df.name, (df.age + 10).alias('age')).collect()
        [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
        """
        jcols = ListConverter().convert([_to_java_column(c) for c in cols],
                                        self._sc._gateway._gateway_client)
        jdf = self._jdf.select(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols))
        return DataFrame(jdf, self.sql_ctx)
Code Example #30
File: rdd.py Project: xoltar/spark
 def _jrdd(self):
     if self._jrdd_val:
         return self._jrdd_val
     if self._bypass_serializer:
         serializer = NoOpSerializer()
     else:
         serializer = self.ctx.serializer
     command = (self.func, self._prev_jrdd_deserializer, serializer)
     pickled_command = CloudPickleSerializer().dumps(command)
     broadcast_vars = ListConverter().convert(
         [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
         self.ctx._gateway._gateway_client)
     self.ctx._pickled_broadcast_vars.clear()
     class_tag = self._prev_jrdd.classTag()
     env = MapConverter().convert(self.ctx.environment,
                                  self.ctx._gateway._gateway_client)
     includes = ListConverter().convert(self.ctx._python_includes,
                                  self.ctx._gateway._gateway_client)
     python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(),
         bytearray(pickled_command), env, includes, self.preservesPartitioning,
         self.ctx.pythonExec, broadcast_vars, self.ctx._javaAccumulator,
         class_tag)
     self._jrdd_val = python_rdd.asJavaRDD()
     return self._jrdd_val
Code Example #31
File: functions.py Project: bopopescu/spark-14
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jcols = ListConverter().convert([_to_java_column(c) for c in cols],
                                    sc._gateway._gateway_client)
    jc = sc._jvm.functions.countDistinct(_to_java_column(col),
                                         sc._jvm.PythonUtils.toSeq(jcols))
    return Column(jc)