def storeDF(self, df: DataFrame, dfName: str, persistType: str, partitions: int, partitionCols: List[str]):
    '''
    Store the input dataframe and return the dataframe to use from here on.

    The effective save type is ``persistType`` upper-cased, unless the
    ``__saveDF`` flag is set and ``persistType`` is neither HIVE nor NULL; in
    that case the value of the ``--saveDFAs`` entry of ``__parms`` is used.
    A save type of S3 is replaced by HDFS when ``__runEnv`` is not ``aws``.

    Dispatch on the effective save type:
      NULL / NONE   -> ``df`` is returned untouched.
      HDFS / S3     -> ``persistExternal`` under the matching temp location.
      HIVE          -> ``persist2Hive``.
      CHECK_POINT   -> ``df.cache().checkpoint(eager=True)``.
      anything else -> ``persistLocal`` with the caller's original
                       ``persistType`` string.

    For HDFS, S3 and HIVE the frame is first passed through
    ``repartitionDF`` with ``partitions`` and the repartitioned frame is the
    one handed to the persist helper.

    NOTE(review): the earlier docstring said the persisted dataframe is read
    back and that take(1) is run to force memory/disk persistence. If that
    happens, it happens inside the persist helpers, which are not visible
    here - confirm against them.

    :param df: dataframe to store.
    :param dfName: name used in the log line and passed to the persist helpers.
    :param persistType: requested persistence type (case-insensitive).
    :param partitions: partition count forwarded to ``repartitionDF``.
    :param partitionCols: partition columns forwarded to the external/Hive helpers.
    :return: whatever the selected helper returns, or ``df`` for NULL/NONE.
    '''
    # Python spells it upper(); the previous toUpperCase() raised
    # AttributeError on every call.
    requestedType = persistType.upper()

    # Exact membership replaces "NULL|NONE".index(...) < 0: str.index raises
    # ValueError when the needle is missing instead of returning -1.
    if self.__explainDF or requestedType not in ("NULL", "NONE"):
        self.log.info("Execution pland for building the DF '%s'" % (dfName))
        df.explain()
        self.log.info("\n\n\n")

    if self.__saveDF and requestedType not in ("HIVE", "NULL"):
        saveType = self.__parms["--saveDFAs"]
    else:
        saveType = requestedType

    # S3 is only usable on AWS; the log message below documents the intended
    # condition, which the previous `== "aws"` test contradicted.
    if saveType == "S3" and self.__runEnv != "aws":
        saveType = "HDFS"
        self.log.debug(
            "Resetting the persist type to 'HDFS' as the --runEnv != 'aws'"
        )

    # Repartition only when writing to an external store, and make sure the
    # repartitioned frame is the one that actually gets written.
    if saveType in ("HDFS", "HIVE", "S3"):
        dfToStore = self.repartitionDF(dataFrame=df, partitions=partitions)
    else:
        dfToStore = df

    if saveType in ("NULL", "NONE"):
        return dfToStore
    if saveType == "HDFS":
        return self.persistExternal(self.__tempHDFS, dfName, dfToStore, partitionCols)
    if saveType == "S3":
        return self.persistExternal(self.__tempS3, dfName, dfToStore, partitionCols)
    if saveType == "HIVE":
        # Previously compared against "", so a HIVE request fell through to
        # persistLocal.
        return self.persist2Hive(dfName, dfToStore, partitionCols)
    if saveType == "CHECK_POINT":
        return df.cache().checkpoint(eager=True)
    # Any other value is treated as a local (memory/disk) persist level; the
    # caller's original, non-upper-cased string is forwarded.
    return self.persistLocal(dfName, df, persistType)
def storeDF(self, df: DataFrame, dfName: str, persistType: str, partitions: int, partitionCols: list[str]):
    '''
    Store the input dataframe and return the dataframe to use from here on.

    The effective save type is ``persistType`` upper-cased, unless
    ``__saveDF`` is truthy and ``persistType`` is neither NULL nor HIVE; in
    that case the value of ``__saveDF`` itself is used as the save type.

    Dispatch on the effective save type:
      NULL / NONE   -> ``df`` is returned untouched.
      HDFS / S3     -> ``persistExternal`` under the matching temp location.
      HIVE          -> ``save2Hive`` into ``self.workDB``.
      CHECK_POINT   -> ``df.cache().checkpoint(eager=True)``.
      anything else -> ``persistLocal`` with the caller's original
                       ``persistType`` string.

    For HDFS, S3 and HIVE the frame is first passed through
    ``repartitionDF`` with ``partitions`` and the repartitioned frame is the
    one handed to the persist helper.

    NOTE(review): the earlier docstring said the persisted dataframe is read
    back and that take(1) is run to force memory/disk persistence. If that
    happens, it happens inside the persist helpers, which are not visible
    here - confirm against them.

    :param df: dataframe to store.
    :param dfName: name used in the log line and as file/table name.
    :param persistType: requested persistence type (case-insensitive).
    :param partitions: partition count forwarded to ``repartitionDF``.
    :param partitionCols: partition columns forwarded to the external/Hive helpers.
    :return: whatever the selected helper returns, or ``df`` for NULL/NONE.
    '''
    # Python spells it upper(); the previous toUpperCase() raised
    # AttributeError on every call.
    requestedType = persistType.upper()
    skipsOverride = requestedType in ("NULL", "HIVE")

    if self.__explainDF and not skipsOverride:
        # NOTE(review): self.log is invoked as a plain callable here; confirm
        # it is not a logging.Logger, which is not callable.
        self.log("\n\n\n")
        self.log("Execution plan for building the DF '%s' is," % (dfName))
        df.explain()
        self.log("\n\n\n")

    # NOTE(review): __saveDF serves both as the on/off switch and as the
    # replacement save type - presumably it holds a string such as "HDFS";
    # verify where it is assigned.
    if self.__saveDF and not skipsOverride:
        saveType = self.__saveDF
    else:
        saveType = requestedType

    # Repartition only when writing to an external store, and make sure the
    # repartitioned frame is the one that actually gets written (previously
    # it was computed and then discarded).
    if saveType in ("HDFS", "HIVE", "S3"):
        dfToStore = self.repartitionDF(dataFrame=df, partitions=partitions)
    else:
        dfToStore = df

    if saveType in ("NULL", "NONE"):
        return dfToStore
    if saveType == "HDFS":
        return self.persistExternal(parentDirURI=self.__tempHDFS, fileName=dfName,
                                    df=dfToStore, partitionCols=partitionCols)
    if saveType == "S3":
        return self.persistExternal(parentDirURI=self.__tempS3, fileName=dfName,
                                    df=dfToStore, partitionCols=partitionCols)
    if saveType == "HIVE":
        return self.save2Hive(db=self.workDB, table=dfName,
                              df=dfToStore, partitionCols=partitionCols)
    if saveType == "CHECK_POINT":
        return df.cache().checkpoint(eager=True)
    # Any other value is treated as a local (memory/disk) persist level; the
    # caller's original, non-upper-cased string is forwarded.
    return self.persistLocal(dfName=dfName, df=df, persistType=persistType)