def process_features(self, df, cols_by_type):
    """Process features before histogram filling.

    Specifically, in this case convert timestamp features to nanoseconds
    since 1970-1-1.

    :param df: input (spark) data frame
    :param dict cols_by_type: dictionary of column-name lists keyed by type;
        the "dt" entry lists the timestamp columns converted here
    :return: output data frame with converted timestamp features
    :rtype: DataFrame
    """
    # make alias df for value counting (used below)
    idf = df.alias('')

    # timestamp variables are converted here to ns since 1970-1-1
    # histogrammar does not yet support long integers, so convert timestamps to float
    # NOTE: float cast loses sub-second precision for large epochs — accepted trade-off
    # epoch = (sparkcol("ts").cast("bigint") * 1000000000).cast("bigint")
    for col in cols_by_type["dt"]:
        self.logger.debug(
            'Converting column "{col}" of type "{type}" to nanosec.'.format(col=col, type=self.var_dtype[col])
        )
        # seconds since epoch -> nanoseconds, as float
        to_ns = sparkcol(col).cast("float") * 1e9
        idf = idf.withColumn(col, to_ns)

    # re-attach histogrammar's spark-sql helper methods to the new data frame
    hg.sparksql.addMethods(idf)
    return idf
def get_nunique(self, df, columns=None):
    """Return dict with number of unique entries for given columns.

    Counts are approximate (computed with Spark's ``approxCountDistinct``).

    :param df: input (spark) data frame
    :param columns: columns to select (optional); defaults to all columns of df
    :return: dictionary mapping column name to approximate distinct count
    :rtype: dict
    """
    # use None instead of a mutable default argument; falsy -> all columns
    if not columns:
        columns = df.columns
    qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns))
    return qdf.toPandas().T[0].to_dict()