def joinWithCassandraTable(dstream, keyspace, table, selected_columns=None, join_columns=None):
    """Joins a DStream (a stream of RDDs) with a Cassandra table.

    Arguments:
    @param dstream(DStream): The DStream to join. Equals to self when
        invoking joinWithCassandraTable on a monkey patched RDD.
    @param keyspace(string): The keyspace to join on.
    @param table(string): The CQL table to join on.
    @param selected_columns(string): The columns to select from the
        Cassandra table.
    @param join_columns(string): The columns used to join on from the
        Cassandra table.
    """
    ssc = dstream._ssc
    ctx = ssc._sc
    gateway = ctx._gateway

    # Optional column lists cross the Py4J bridge as java String[] (or null).
    java_selected = (as_java_array(gateway, "String", selected_columns)
                     if selected_columns else None)
    java_join = (as_java_array(gateway, "String", join_columns)
                 if join_columns else None)

    h = helper(ctx)
    joined = h.joinWithCassandraTable(dstream._jdstream, keyspace, table,
                                      java_selected, java_join)
    pickled = h.pickleRows(joined)
    jdstream = h.javaDStream(pickled)
    return DStream(jdstream, ssc, AutoBatchedSerializer(PickleSerializer()))
def joinWithCassandraTable(dstream, keyspace, table, selected_columns=None, join_columns=None):
    """Joins a DStream (a stream of RDDs) with a Cassandra table.

    Arguments:
    @param dstream(DStream): The DStream to join. Equals to self when
        invoking joinWithCassandraTable on a monkey patched RDD.
    @param keyspace(string): The keyspace to join on.
    @param table(string): The CQL table to join on.
    @param selected_columns(string): The columns to select from the
        Cassandra table.
    @param join_columns(string): The columns used to join on from the
        Cassandra table.
    """
    ssc = dstream._ssc
    ctx = ssc._sc
    gw = ctx._gateway

    # Py4J expects java String[] (or null) for the optional column lists.
    sel = None if not selected_columns else as_java_array(gw, "String", selected_columns)
    joins = None if not join_columns else as_java_array(gw, "String", join_columns)

    h = helper(ctx)
    jdstream = h.javaDStream(
        h.pickleRows(
            h.joinWithCassandraTable(dstream._jdstream, keyspace, table, sel, joins)))
    return DStream(jdstream, ssc, AutoBatchedSerializer(PickleSerializer()))
def deleteFromCassandra(dstream, keyspace=None, table=None, deleteColumns=None,
                        keyColumns=None, row_format=None, keyed=None,
                        write_conf=None, **write_conf_kwargs):
    """Delete data from a Cassandra table, using data from the DStream's RDDs
    as primary keys. Uses the specified column names.

    Arguments:
    @param dstream(DStream): The DStream whose rows supply the primary keys
        of the rows to delete. Equals to self when invoking
        deleteFromCassandra on a monkey patched DStream.
    @param keyspace(string): The keyspace to delete from.
    @param table(string): The CQL table to delete from.

    Keyword arguments:
    @param deleteColumns(iterable): The list of column names to delete; an
        empty ColumnSelector means full row.
    @param keyColumns(iterable): The list of primary key column names used to
        select the rows to delete; an empty ColumnSelector means all primary
        key columns.
    @param row_format(RowFormat): Primary key columns selector, Optional.
        All RDD primary columns will be checked by default.
    @param keyed(bool): Make explicit that the RDD consists of key, value
        tuples (and not arrays of length two).
    @param write_conf(WriteConf): A WriteConf object to use when deleting
        from Cassandra.
    @param **write_conf_kwargs: WriteConf parameters to use when deleting
        from Cassandra.
    """
    ctx = dstream._ssc._sc
    gw = ctx._gateway

    # create write config as map
    write_conf = WriteConf.build(write_conf, **write_conf_kwargs)
    write_conf = as_java_object(gw, write_conf.settings())

    # convert the columns to a string array
    deleteColumns = as_java_array(gw, "String", deleteColumns) \
        if deleteColumns else None
    keyColumns = as_java_array(gw, "String", keyColumns) \
        if keyColumns else None

    return helper(ctx).deleteFromCassandra(dstream._jdstream, keyspace, table,
                                           deleteColumns, keyColumns,
                                           row_format, keyed, write_conf)
def asDataFrames(self, *index_by):
    '''
    Reads the spanned rows as DataFrames if pandas is available, or as
    a dict of numpy arrays if only numpy is available, or as a dict with
    primitives and objects otherwise.

    @param index_by If pandas is available, the dataframes will be
        indexed by the given columns.
    '''
    for c in index_by:
        if c in self.columns:
            # Fix: the original never interpolated the offending column
            # name, so the message contained a literal '%s'.
            raise ValueError(
                'column %s cannot be used as index in the data'
                'frames as it is a column by which the rows are spanned.' % c)

    columns = as_java_array(self.ctx._gateway, "String",
                            (str(c) for c in self.columns))
    jrdd = self._helper.spanBy(self._crdd, columns)
    rdd = RDD(jrdd, self.ctx)

    global pd
    if index_by and pd:
        # Fix: pass the index columns as a list — unpacking them as
        # positional args makes the second column land in set_index's
        # 'drop' parameter.
        return rdd.mapValues(
            lambda _: _.set_index([str(c) for c in index_by]))
    else:
        return rdd
def where(self, clause, *args):
    """Creates a CassandraRDD with a CQL where clause applied.
    @param clause: The where clause, either complete or with ? markers
    @param *args: The parameters for the ? markers in the where clause.
    """
    # Bind parameters travel to the JVM as a java Object[].
    params = as_java_array(self.ctx._gateway, "Object", args)
    return self._specialize('where', clause, params)
def where(self, clause, *args):
    """Creates a CassandraRDD with a CQL where clause applied.
    @param clause: The where clause, either complete or with ? markers
    @param *args: The parameters for the ? markers in the where clause.
    """
    java_args = as_java_array(self.ctx._gateway, "Object", args)
    return self._specialize('where', clause, java_args)
def saveToCassandra(rdd, keyspace=None, table=None, columns=None,
                    row_format=None, keyed=None,
                    write_conf=None, **write_conf_kwargs):
    '''
    Saves an RDD to Cassandra. The RDD is expected to contain dicts with
    keys mapping to CQL columns.

    Arguments:
    @param rdd(RDD): The RDD to save. Equals to self when invoking
        saveToCassandra on a monkey patched RDD.
    @param keyspace(string): The keyspace to save the RDD in. If not given
        and the rdd is a CassandraRDD the same keyspace is used.
    @param table(string): The CQL table to save the RDD in. If not given
        and the rdd is a CassandraRDD the same table is used.

    Keyword arguments:
    @param columns(iterable): The columns to save, i.e. which keys to take
        from the dicts in the RDD. If None given all columns are stored.
    @param row_format(RowFormat): Make explicit how to map the RDD elements
        into Cassandra rows. If None given the mapping is auto-detected as
        far as possible.
    @param keyed(bool): Make explicit that the RDD consists of key, value
        tuples (and not arrays of length two).
    @param write_conf(WriteConf): A WriteConf object to use when saving to
        Cassandra.
    @param **write_conf_kwargs: WriteConf parameters to use when saving to
        Cassandra.
    '''
    keyspace = keyspace or getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    table = table or getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # create write config as map
    write_conf = WriteConf.build(write_conf, **write_conf_kwargs)
    write_conf = as_java_object(rdd.ctx._gateway, write_conf.settings())

    # convert the columns to a string array
    columns = as_java_array(rdd.ctx._gateway, "String", columns) if columns else None

    helper(rdd.ctx) \
        .saveToCassandra(
            rdd._jrdd,
            keyspace,
            table,
            columns,
            row_format,
            keyed,
            write_conf,
        )
def saveToCassandra(dstream, keyspace, table, columns=None, row_format=None,
                    keyed=None, write_conf=None, **write_conf_kwargs):
    """Saves the RDDs of a DStream to a Cassandra table via the JVM helper.

    @param dstream(DStream): The DStream to save. Equals to self when
        invoking saveToCassandra on a monkey patched DStream.
    @param keyspace(string): The keyspace to save to.
    @param table(string): The CQL table to save to.
    @param columns(iterable): The columns to save; all columns if None.
    @param row_format(RowFormat): How to map elements to Cassandra rows.
    @param keyed(bool): Make explicit that the stream consists of key,
        value tuples (and not arrays of length two).
    @param write_conf(WriteConf): A WriteConf object to use when saving.
    @param **write_conf_kwargs: WriteConf parameters to use when saving.
    """
    ctx = dstream._ssc._sc
    gw = ctx._gateway

    # Build the write configuration and hand it over as a java map.
    conf = as_java_object(
        gw, WriteConf.build(write_conf, **write_conf_kwargs).settings())

    # Columns travel as a java String[]; null means "all columns".
    cols = as_java_array(gw, "String", columns) if columns else None

    return helper(ctx).saveToCassandra(dstream._jdstream, keyspace, table,
                                       cols, row_format, keyed, conf)
def saveToCassandra(dstream, keyspace, table, columns=None, row_format=None,
                    keyed=None, write_conf=None, **write_conf_kwargs):
    """Saves the RDDs of a DStream to a Cassandra table.

    @param dstream(DStream): The DStream to save. Equals to self when
        invoking saveToCassandra on a monkey patched DStream.
    @param keyspace(string): The keyspace to save to.
    @param table(string): The CQL table to save to.
    @param columns(iterable): The columns to save; all columns if None.
    @param row_format(RowFormat): How to map elements to Cassandra rows.
    @param keyed(bool): Make explicit that the stream consists of key,
        value tuples (and not arrays of length two).
    @param write_conf(WriteConf): A WriteConf object to use when saving.
    @param **write_conf_kwargs: WriteConf parameters to use when saving.
    """
    spark_ctx = dstream._ssc._sc
    gateway = spark_ctx._gateway

    # create write config as map
    settings = WriteConf.build(write_conf, **write_conf_kwargs).settings()
    java_conf = as_java_object(gateway, settings)

    # convert the columns to a string array (null selects all columns)
    java_columns = None
    if columns:
        java_columns = as_java_array(gateway, "String", columns)

    return helper(spark_ctx).saveToCassandra(
        dstream._jdstream, keyspace, table, java_columns,
        row_format, keyed, java_conf)
def asDataFrames(self, *index_by):
    '''
    Reads the spanned rows as DataFrames if pandas is available, or as
    a dict of numpy arrays if only numpy is available, or as a dict with
    primitives and objects otherwise.

    @param index_by If pandas is available, the dataframes will be
        indexed by the given columns.
    '''
    for c in index_by:
        if c in self.columns:
            # Fix: interpolate the column name; the original left a
            # literal '%s' in the raised message.
            raise ValueError('column %s cannot be used as index in the data'
                             'frames as it is a column by which the rows are spanned.' % c)

    columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
    jrdd = self._helper.spanBy(self._crdd, columns)
    rdd = RDD(jrdd, self.ctx)

    global pd
    if index_by and pd:
        # Fix: set_index takes the key columns as a list; unpacking them
        # as positional args misroutes the second one into 'drop'.
        return rdd.mapValues(lambda _: _.set_index([str(c) for c in index_by]))
    else:
        return rdd
def on(self, *columns):
    # The join columns are passed to the JVM side as a java String[].
    names = [str(c) for c in columns]
    arr = as_java_array(self.ctx._gateway, "String", names)
    return self._specialize('on', arr)
def select(self, *columns):
    """Creates a CassandraRDD with the select clause applied."""
    # Column names must cross the Py4J bridge as a java String[].
    arr = as_java_array(self.ctx._gateway, "String",
                        [str(c) for c in columns])
    return self._specialize('select', arr)
def deleteFromCassandra(rdd, keyspace=None, table=None, deleteColumns=None,
                        keyColumns=None, row_format=None, keyed=None,
                        write_conf=None, **write_conf_kwargs):
    """
    Delete data from a Cassandra table, using data from the RDD as primary
    keys. Uses the specified column names.

    Arguments:
    @param rdd(RDD): The RDD whose rows supply the primary keys of the rows
        to delete. Equals to self when invoking deleteFromCassandra on a
        monkey patched RDD.
    @param keyspace(string): The keyspace to delete from. If not given and
        the rdd is a CassandraRDD the same keyspace is used.
    @param table(string): The CQL table to delete from. If not given and
        the rdd is a CassandraRDD the same table is used.

    Keyword arguments:
    @param deleteColumns(iterable): The list of column names to delete; an
        empty ColumnSelector means full row.
    @param keyColumns(iterable): The list of primary key column names used
        to select the rows to delete; an empty ColumnSelector means all
        primary key columns.
    @param row_format(RowFormat): Primary key columns selector, Optional.
        All RDD primary columns will be checked by default.
    @param keyed(bool): Make explicit that the RDD consists of key, value
        tuples (and not arrays of length two).
    @param write_conf(WriteConf): A WriteConf object to use when deleting
        from Cassandra.
    @param **write_conf_kwargs: WriteConf parameters to use when deleting
        from Cassandra.
    """
    keyspace = keyspace or getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    table = table or getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # create write config as map
    write_conf = WriteConf.build(write_conf, **write_conf_kwargs)
    write_conf = as_java_object(rdd.ctx._gateway, write_conf.settings())

    # convert the columns to a string array
    deleteColumns = as_java_array(rdd.ctx._gateway, "String", deleteColumns) \
        if deleteColumns else None
    keyColumns = as_java_array(rdd.ctx._gateway, "String", keyColumns) \
        if keyColumns else None

    helper(rdd.ctx) \
        .deleteFromCassandra(
            rdd._jrdd,
            keyspace,
            table,
            deleteColumns,
            keyColumns,
            row_format,
            keyed,
            write_conf,
        )
def on(self, *columns):
    # Convert the column identifiers to strings and hand them to the JVM
    # helper as a java String[].
    java_columns = as_java_array(
        self.ctx._gateway, "String", map(str, columns))
    return self._specialize('on', java_columns)
def select(self, *columns):
    """Creates a CassandraRDD with the select clause applied."""
    java_columns = as_java_array(
        self.ctx._gateway, "String", map(str, columns))
    return self._specialize('select', java_columns)