def attach_distributed_sequence_column(sdf, column_name):
    """
    Attach a sequence column, computed in a distributed manner, to a Spark
    DataFrame.

    The attached column is equivalent to the one assigned when the default
    index type is 'distributed-sequence'. In the result the sequence column
    comes first, followed by the columns of ``sdf``.

    If the JVM-based path raises ``py4j.protocol.Py4JError``, the error is
    re-raised when ``is_testing()`` is true; otherwise the work is delegated
    to ``InternalFrame._attach_distributed_sequence_column``.

    >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
    >>> sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
    >>> sdf.show()  # doctest: +NORMALIZE_WHITESPACE
    +--------+---+
    |sequence|  0|
    +--------+---+
    |       0|  a|
    |       1|  b|
    |       2|  c|
    +--------+---+
    """
    if not sdf.columns:
        # Column-less input: there is no row payload to pair with an index,
        # so the result consists of the sequence column only.
        row_count = sdf.count()
        session = default_session()
        if row_count <= 0:
            empty_schema = StructType().add(
                column_name, data_type=LongType(), nullable=False
            )
            return session.createDataFrame([], schema=empty_schema)
        return session.range(row_count).toDF(column_name)

    try:
        java_df = sdf._jdf.toDF()
        ctx = sdf.sql_ctx

        # Encoder for (row, long) pairs, matching what zipWithIndex produces.
        encoder_factory = ctx._jvm.org.apache.spark.sql.Encoders
        pair_encoder = encoder_factory.tuple(
            java_df.exprEnc(), encoder_factory.scalaLong()
        )

        # NOTE(review): the local checkpoint is requested with False
        # (presumably "not eager") before indexing -- confirm the intent.
        indexed_rdd = java_df.localCheckpoint(False).rdd().zipWithIndex()

        java_dataset = ctx.sparkSession._jsparkSession.createDataset(
            indexed_rdd, pair_encoder
        )
        indexed = spark.DataFrame(java_dataset.toDF(), ctx)

        # The first column holds the original row as a struct and the second
        # holds the index: rename the index and expand the struct after it.
        names = indexed.columns
        sequence_expr = "`{}` as `{}`".format(names[1], column_name)
        payload_expr = "`{}`.*".format(names[0])
        return indexed.selectExpr(sequence_expr, payload_expr)
    except py4j.protocol.Py4JError:
        if not is_testing():
            return InternalFrame._attach_distributed_sequence_column(
                sdf, column_name
            )
        # Under test, surface the JVM failure rather than hiding it behind
        # the fallback.
        raise
def unique(self):
    """
    Return a new DataFrame built from the distinct rows of this one.

    Deduplication is delegated to ``distinct()`` on the underlying JVM
    DataFrame. The result is wrapped in a Spark DataFrame that shares the
    original SQL context, and is paired with a copy of this object's metadata.
    """
    backing = self._sdf
    deduplicated = spark.DataFrame(backing._jdf.distinct(), backing.sql_ctx)
    return DataFrame(deduplicated, self._metadata.copy())