from pyspark.sql import DataFrame


def write_evidence_strings(evidence_df: DataFrame, output_file: str) -> None:
    """
    Exports the table to a compressed JSON file containing the evidence strings.
    The dataframe is coalesced to a single partition, so Spark writes one
    gzip-compressed part file into the output directory.
    """
    evidence_df.coalesce(1).write.format('json').mode('overwrite').option(
        'compression', 'gzip').save(output_file)
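A minimal usage sketch, assuming a local SparkSession; the column names and output path are illustrative. Note that even with coalesce(1), Spark's save() produces a directory containing a single gzip-compressed part file rather than a bare .json.gz file.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()
# Hypothetical evidence dataframe with a handful of columns.
evidence = spark.createDataFrame(
    [('ENSG00000139618', 'EFO_0000305', 0.9)],
    ['targetId', 'diseaseId', 'score'],
)
write_evidence_strings(evidence, 'output/evidence_json')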
Example #2
    def repartitionDF(self, df: DataFrame, partitions: int = 0):
        '''
            Repartition the input dataframe

            params: df          -> dataframe
                    partitions  -> new partition count. Defaults to 0, i.e. don't repartition

            logic:
                if partitions == 0,  don't repartition
                if partitions == -1, repartition to the default number (NumOfExecutors * ExecutorCores * 2)
                if partitions > 0,   repartition/coalesce to the given number
        '''
        curParts = df.rdd.getNumPartitions()
        finalParts = min(curParts, partitions)

        if curParts == partitions or partitions == 0:
            finalParts = -1
        elif partitions == -1:
            finalParts = self.__dfltRDDParts
        elif partitions > 0:
            finalParts = partitions
        else:
            pass  # finalParts is pre-populated.

        self.log("Current Partitions: %d , Requested: %d,  Final: %d " %
                 (curParts, partitions, finalParts))

        if finalParts == -1:
            return df  # nothing to do
        elif curParts > finalParts:
            return df.coalesce(finalParts)  # shrinking: coalesce avoids a full shuffle
        else:
            return df.repartition(finalParts)  # growing: repartition requires a shuffle
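The same decision rule, condensed into a standalone sketch for illustration. It assumes a SparkSession named spark, and spark.sparkContext.defaultParallelism stands in for the class's __dfltRDDParts default; the key distinction is that shrinking uses coalesce (no shuffle) while growing uses repartition (full shuffle).

def resize(df, partitions=0):
    cur = df.rdd.getNumPartitions()
    if partitions == 0 or partitions == cur:
        return df  # nothing to change
    # -1 means "use the default parallelism" (assumption standing in for __dfltRDDParts)
    target = spark.sparkContext.defaultParallelism if partitions == -1 else partitions
    # coalesce narrows partitions without a shuffle; repartition redistributes with one
    return df.coalesce(target) if cur > target else df.repartition(target)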
Example #3
    def write(self, feature_set: FeatureSet, dataframe: DataFrame,
              spark_client: SparkClient) -> Any:
        """Write output to a single-file CSV dataset."""
        path = f"data/datasets/{feature_set.name}"
        spark_client.write_dataframe(
            dataframe=dataframe.coalesce(1),
            format_="csv",
            mode="overwrite",
            path=path,
            header=True,
        )
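For readers without the surrounding FeatureSet and SparkClient classes, a roughly equivalent write in plain PySpark could look like the sketch below, assuming spark_client.write_dataframe forwards these options to the underlying DataFrameWriter; the output path is illustrative.

(dataframe.coalesce(1)
    .write.mode('overwrite')
    .option('header', True)
    .csv('data/datasets/my_feature_set'))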
Example #4
def writeFile(df: DataFrame, filepath: str, filetype: str) -> None:
    if isinstance(df, DataFrame):
        # Dispatch to the matching DataFrameWriter method; coalesce(1) gives one
        # output file, written with a header row (csv).
        getattr(df.coalesce(1).write, filetype)(filepath, mode="overwrite", header="true")
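An illustrative call, assuming df is an existing PySpark DataFrame. As written, the helper really only suits "csv": the hard-coded header keyword is a valid option for DataFrameWriter.csv but is not a parameter of the json or parquet writer methods.

writeFile(df, 'out/report_csv', 'csv')  # writes a single part-*.csv file with a header row under out/report_csv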
Example #5
import glob
from typing import Sequence

def _save_parquet_local(spark_df: Spark_df, fpath: str) -> Sequence[str]:
    spark_df.coalesce(1).write.parquet(fpath)
    return glob.glob(f"{fpath}/*.parquet")
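A possible follow-up, assuming the caller wants the lone part file under a stable name; spark_df is an existing Spark dataframe and the paths are illustrative.

import shutil

parts = _save_parquet_local(spark_df, '/tmp/out_parquet')
if parts:
    shutil.move(parts[0], '/tmp/out.parquet')  # promote the single part file to a fixed name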