Example #1
File: sql.py Project: twneale/spark
    def jsonRDD(self, rdd):
        """Loads an RDD storing one JSON object per string, returning the result as a L{SchemaRDD}.
           It goes through the entire dataset once to determine the schema.

        >>> srdd = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd, "table1")
        >>> srdd2 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1")
        >>> srdd2.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        """
        def func(split, iterator):
            # Coerce each element to a UTF-8 byte string so the JVM side
            # receives plain JSON text.
            for x in iterator:
                if not isinstance(x, basestring):
                    x = unicode(x)
                yield x.encode("utf-8")

        keyed = PipelinedRDD(rdd, func)
        keyed._bypass_serializer = True  # hand raw bytes to the JVM, skipping pickling
        jrdd = keyed._jrdd.map(self._jvm.BytesToString())
        jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd())
        return SchemaRDD(jschema_rdd, self)
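A minimal usage sketch of this variant, assuming a live SparkContext sc and SQLContext sqlCtx (Python 2-era API, in which SchemaRDD predates DataFrame):

json_strings = sc.parallelize(['{"a": 1, "b": "x"}', '{"a": 2, "b": "y"}'])
srdd = sqlCtx.jsonRDD(json_strings)  # one full pass over the data to infer the schema
sqlCtx.registerRDDAsTable(srdd, "people")
print(sqlCtx.sql("SELECT a, b FROM people").collect())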
Example #2
File: sql.py Project: joyyoj/spark
    def jsonRDD(self, rdd, schema=None):
        """Loads an RDD storing one JSON object per string as a L{SchemaRDD}.

        If the schema is provided, applies the given schema to this JSON dataset.
        Otherwise, it goes through the entire dataset once to determine the schema.

        >>> srdd1 = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd1, "table1")
        >>> srdd2 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1")
        >>> srdd2.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema())
        >>> sqlCtx.registerRDDAsTable(srdd3, "table2")
        >>> srdd4 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table2")
        >>> srdd4.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> schema = StructType([
        ...     StructField("field2", StringType(), True),
        ...     StructField("field3",
        ...         StructType([
        ...             StructField("field5", ArrayType(IntegerType(), False), True)]), False)])
        >>> srdd5 = sqlCtx.jsonRDD(json, schema)
        >>> sqlCtx.registerRDDAsTable(srdd5, "table3")
        >>> srdd6 = sqlCtx.sql(
        ...   "SELECT field2 AS f1, field3.field5 as f2, field3.field5[0] as f3 from table3")
        >>> srdd6.collect() == [
        ... {"f1": "row1", "f2": None, "f3": None},
        ... {"f1": None, "f2": [10, 11], "f3": 10},
        ... {"f1": "row3", "f2": [], "f3": None}]
        True
        """
        def func(split, iterator):
            for x in iterator:
                if not isinstance(x, basestring):
                    x = unicode(x)
                yield x.encode("utf-8")

        keyed = PipelinedRDD(rdd, func)
        keyed._bypass_serializer = True
        jrdd = keyed._jrdd.map(self._jvm.BytesToString())
        if schema is None:
            # No schema supplied: infer it with a full pass over the dataset.
            jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd())
        else:
            # The StructType's repr is parsed on the JVM side into a Scala DataType.
            scala_datatype = self._ssql_ctx.parseDataType(schema.__repr__())
            jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype)
        return SchemaRDD(jschema_rdd, self)
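When a schema is supplied, the inference pass is skipped and the given StructType is applied as-is. A short sketch under the same assumptions as the sketch in Example #1 (type names as in the doctest above):

schema = StructType([StructField("a", IntegerType(), True),
                     StructField("b", StringType(), True)])
srdd = sqlCtx.jsonRDD(json_strings, schema)  # no inference; fields missing from the JSON come back as None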
Example #3
File: sql.py Project: 4Quant/spark
    def jsonRDD(self, rdd, schema=None):
        """Loads an RDD storing one JSON object per string as a L{SchemaRDD}.

        If the schema is provided, applies the given schema to this JSON dataset.
        Otherwise, it goes through the entire dataset once to determine the schema.

        >>> srdd1 = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd1, "table1")
        >>> srdd2 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1")
        >>> srdd2.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema())
        >>> sqlCtx.registerRDDAsTable(srdd3, "table2")
        >>> srdd4 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table2")
        >>> srdd4.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> schema = StructType([
        ...     StructField("field2", StringType(), True),
        ...     StructField("field3",
        ...         StructType([
        ...             StructField("field5", ArrayType(IntegerType(), False), True)]), False)])
        >>> srdd5 = sqlCtx.jsonRDD(json, schema)
        >>> sqlCtx.registerRDDAsTable(srdd5, "table3")
        >>> srdd6 = sqlCtx.sql(
        ...   "SELECT field2 AS f1, field3.field5 as f2, field3.field5[0] as f3 from table3")
        >>> srdd6.collect() == [
        ... {"f1": "row1", "f2": None, "f3": None},
        ... {"f1": None, "f2": [10, 11], "f3": 10},
        ... {"f1": "row3", "f2": [], "f3": None}]
        True
        """
        def func(split, iterator):
            for x in iterator:
                if not isinstance(x, basestring):
                    x = unicode(x)
                yield x.encode("utf-8")
        keyed = PipelinedRDD(rdd, func)
        keyed._bypass_serializer = True
        jrdd = keyed._jrdd.map(self._jvm.BytesToString())
        if schema is None:
            jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd())
        else:
            scala_datatype = self._ssql_ctx.parseDataType(schema.__repr__())
            jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype)
        return SchemaRDD(jschema_rdd, self)
Example #4
File: sql.py Project: 7472741/spark
    def jsonRDD(self, rdd):
        """Loads an RDD storing one JSON object per string, returning the result as a L{SchemaRDD}.
           It goes through the entire dataset once to determine the schema.

        >>> srdd = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd, "table1")
        >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1")
        >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}},
        ...                     {"f1": 2, "f2": "row2", "f3":{"field4":22}},
        ...                     {"f1": 3, "f2": "row3", "f3":{"field4":33}}]
        True
        """
        def func(split, iterator):
            for x in iterator:
                if not isinstance(x, basestring):
                    x = unicode(x)
                yield x.encode("utf-8")
        keyed = PipelinedRDD(rdd, func)
        keyed._bypass_serializer = True
        jrdd = keyed._jrdd.map(self._jvm.BytesToString())
        jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd())
        return SchemaRDD(jschema_rdd, self)
Example #5
    def mapPartitionsWithIndex(self,
                               f,
                               preservesPartitioning=False,
                               transformation_ctx="",
                               info="",
                               stageThreshold=0,
                               totalThreshold=0):
        return DynamicFrame(
            self.glue_ctx._jvm.DynamicFrame.fromPythonRDD(
                self._jdf,
                PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd,
                self.glue_ctx._ssql_ctx, transformation_ctx, self.name,
                _call_site(self._sc, callsite(), info), long(stageThreshold),
                long(totalThreshold)), self.glue_ctx, self.name)
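A hypothetical invocation of this Glue variant, assuming a DynamicFrame named dyf obtained from a GlueContext; the function receives the partition index plus an iterator over that partition's records, and the added field name is invented for illustration:

def tag_partition(index, records):
    for rec in records:
        rec["partition_id"] = index  # illustration only; field name is made up
        yield rec

tagged = dyf.mapPartitionsWithIndex(tag_partition, transformation_ctx="tag_partitions")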
Example #6
    def __call__(self, sparse_matrix: PipelinedRDD):
        """
        Saves Cooccurrences asdf model to disk.

        :param sparse_matrix: rdd with 3 columns: matrix row, matrix column,  cell value. Use
            :class:`.CooccConstructor` to construct RDD from uasts.
        :return:
        """
        rows = sparse_matrix.collect()

        mat_row, mat_col, mat_weights = zip(*rows)
        tokens_num = len(self.tokens_list)

        self._log.info("Building matrix...")
        matrix = sparse.coo_matrix((mat_weights, (mat_row, mat_col)),
                                   shape=(tokens_num, tokens_num))
        Cooccurrences() \
            .construct(self.tokens_list, matrix) \
            .save(self.output, deps=(self.df_model,))
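The coo_matrix construction above is easy to illustrate standalone, with toy (row, column, weight) triples standing in for the collected RDD rows:

from scipy import sparse

rows = [(0, 1, 2.0), (1, 0, 2.0), (1, 2, 1.0)]  # toy stand-in for sparse_matrix.collect()
mat_row, mat_col, mat_weights = zip(*rows)
matrix = sparse.coo_matrix((mat_weights, (mat_row, mat_col)), shape=(3, 3))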
Example #7
    def mapPartitionsWithIndex(self, f, preservesPartitioning=True):
        return DynamicFrame(
            self.glue_ctx._jvm.DynamicFrame.fromPythonRDD(
                PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd,
                self.glue_ctx._ssql_ctx), self.glue_ctx, self.name)
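Compared with Example #5, this older overload passes only the pipelined JRDD and the SQL context to fromPythonRDD, omits the error-tracking arguments (transformation_ctx, info, thresholds), and defaults preservesPartitioning to True.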
Example #8
def mapPartitionsWithIndex(rdd, f, preservesPartitioning=False):
    """
    Temporary function for barrier map partitions.
    """
    return PipelinedRDD(rdd, f, preservesPartitioning, isFromBarrier=True)
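A sketch of driving this helper; the BarrierTaskContext call reflects typical barrier-stage usage and is an assumption, not something taken from the snippet:

from pyspark import BarrierTaskContext

def sync_then_count(index, it):
    BarrierTaskContext.get().barrier()  # all tasks in the stage rendezvous here
    yield sum(1 for _ in it)

counts = mapPartitionsWithIndex(sc.parallelize(range(100), 4), sync_then_count).collect()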