def jsonRDD(self, rdd): """Loads an RDD storing one JSON object per string, returning the result as a L{SchemaRDD}. It goes through the entire dataset once to determine the schema. >>> srdd = sqlCtx.jsonRDD(json) >>> sqlCtx.registerRDDAsTable(srdd, "table1") >>> srdd2 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1") >>> srdd2.collect() == [ ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None}, ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]}, ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}] True """ def func(split, iterator): for x in iterator: if not isinstance(x, basestring): x = unicode(x) yield x.encode("utf-8") keyed = PipelinedRDD(rdd, func) keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) return SchemaRDD(jschema_rdd, self)
def jsonRDD(self, rdd, schema=None):
    """Loads an RDD storing one JSON object per string as a L{SchemaRDD}.

    If the schema is provided, applies the given schema to this JSON dataset.
    Otherwise, it goes through the entire dataset once to determine the schema.

    >>> srdd1 = sqlCtx.jsonRDD(json)
    >>> sqlCtx.registerRDDAsTable(srdd1, "table1")
    >>> srdd2 = sqlCtx.sql(
    ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1")
    >>> srdd2.collect() == [
    ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
    ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
    ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
    True

    >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema())
    >>> sqlCtx.registerRDDAsTable(srdd3, "table2")
    >>> srdd4 = sqlCtx.sql(
    ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table2")
    >>> srdd4.collect() == [
    ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
    ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
    ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
    True

    >>> schema = StructType([
    ...     StructField("field2", StringType(), True),
    ...     StructField("field3",
    ...         StructType([
    ...             StructField("field5", ArrayType(IntegerType(), False), True)]), False)])
    >>> srdd5 = sqlCtx.jsonRDD(json, schema)
    >>> sqlCtx.registerRDDAsTable(srdd5, "table3")
    >>> srdd6 = sqlCtx.sql(
    ...   "SELECT field2 AS f1, field3.field5 as f2, field3.field5[0] as f3 from table3")
    >>> srdd6.collect() == [
    ... {"f1": "row1", "f2": None, "f3": None},
    ... {"f1": None, "f2": [10, 11], "f3": 10},
    ... {"f1": "row3", "f2": [], "f3": None}]
    True
    """
    def func(split, iterator):
        for x in iterator:
            if not isinstance(x, basestring):
                x = unicode(x)
            yield x.encode("utf-8")
    keyed = PipelinedRDD(rdd, func)
    keyed._bypass_serializer = True
    jrdd = keyed._jrdd.map(self._jvm.BytesToString())
    if schema is None:
        jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd())
    else:
        scala_datatype = self._ssql_ctx.parseDataType(schema.__repr__())
        jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype)
    return SchemaRDD(jschema_rdd, self)
def jsonRDD(self, rdd): """Loads an RDD storing one JSON object per string, returning the result as a L{SchemaRDD}. It goes through the entire dataset once to determine the schema. >>> srdd = sqlCtx.jsonRDD(json) >>> sqlCtx.registerRDDAsTable(srdd, "table1") >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1") >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}}, ... {"f1": 2, "f2": "row2", "f3":{"field4":22}}, ... {"f1": 3, "f2": "row3", "f3":{"field4":33}}] True """ def func(split, iterator): for x in iterator: if not isinstance(x, basestring): x = unicode(x) yield x.encode("utf-8") keyed = PipelinedRDD(rdd, func) keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) return SchemaRDD(jschema_rdd, self)
def mapPartitionsWithIndex(self, f, preservesPartitioning=False,
                           transformation_ctx="", info="",
                           stageThreshold=0, totalThreshold=0):
    # Run f over each partition of the underlying RDD via PipelinedRDD, then
    # rebuild a DynamicFrame from the resulting Python RDD on the JVM side,
    # carrying the Glue call-site and error-threshold bookkeeping along.
    return DynamicFrame(
        self.glue_ctx._jvm.DynamicFrame.fromPythonRDD(
            self._jdf,
            PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd,
            self.glue_ctx._ssql_ctx,
            transformation_ctx,
            self.name,
            _call_site(self._sc, callsite(), info),
            long(stageThreshold),
            long(totalThreshold)),
        self.glue_ctx,
        self.name)
def __call__(self, sparse_matrix: PipelinedRDD):
    """
    Saves the Cooccurrences model to disk.

    :param sparse_matrix: RDD with 3 columns: matrix row, matrix column, cell value. \
        Use :class:`.CooccConstructor` to construct the RDD from UASTs.
    :return: None
    """
    rows = sparse_matrix.collect()
    mat_row, mat_col, mat_weights = zip(*rows)
    tokens_num = len(self.tokens_list)
    self._log.info("Building matrix...")
    matrix = sparse.coo_matrix((mat_weights, (mat_row, mat_col)),
                               shape=(tokens_num, tokens_num))
    Cooccurrences() \
        .construct(self.tokens_list, matrix) \
        .save(self.output, deps=(self.df_model,))
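# A standalone sketch (not from the original repo) of the matrix-building step
# the saver above performs: collect (row, col, weight) triples, unzip them,
# and feed scipy's coo_matrix. The triples below are made up; in the real
# pipeline they come from a PipelinedRDD produced by CooccConstructor.
from scipy import sparse

rows = [(0, 1, 3.0), (1, 2, 5.0), (2, 0, 1.0)]  # (matrix row, matrix column, cell value)
mat_row, mat_col, mat_weights = zip(*rows)
tokens_num = 3                                   # stands in for len(self.tokens_list)
matrix = sparse.coo_matrix((mat_weights, (mat_row, mat_col)),
                           shape=(tokens_num, tokens_num))
print(matrix.toarray())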
def mapPartitionsWithIndex(self, f, preservesPartitioning=True):
    # Apply f to each partition (with its index) through PipelinedRDD and wrap
    # the resulting Python RDD back into a DynamicFrame on the JVM side.
    return DynamicFrame(
        self.glue_ctx._jvm.DynamicFrame.fromPythonRDD(
            PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd,
            self.glue_ctx._ssql_ctx),
        self.glue_ctx,
        self.name)
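# A hedged usage sketch for the DynamicFrame methods above (all names
# invented): f follows the (partition_index, record_iterator) contract that
# PipelinedRDD expects, and must itself yield the records that make up the
# new frame. Here it drops every record living in an odd-numbered partition.
def keep_even_partitions(index, iterator):
    for record in iterator:
        if index % 2 == 0:
            yield record

# Assuming `dynamic_frame` is an existing Glue DynamicFrame:
# filtered = dynamic_frame.mapPartitionsWithIndex(keep_even_partitions)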
def mapPartitionsWithIndex(rdd, f, preservesPartitioning=False):
    """
    Temporary function for barrier map partitions.
    """
    return PipelinedRDD(rdd, f, preservesPartitioning, isFromBarrier=True)
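# A hedged usage sketch, not from the original source: because the helper
# passes isFromBarrier=True, all partitions are launched together in one
# barrier stage (Spark 2.4+ barrier execution). The function keeps the usual
# (index, iterator) signature. `sc` is an assumed live SparkContext.
def label_partition(index, iterator):
    for value in iterator:
        yield (index, value)

rdd = sc.parallelize(range(8), 4)
print(mapPartitionsWithIndex(rdd, label_partition).collect())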