Example #1
    def _create_table_or_temp_view_from_csv(self,
                                            name,
                                            path,
                                            schema=None,
                                            database=None,
                                            force=False,
                                            temp_view=False,
                                            format='parquet',
                                            **kwargs):
        # Start from the backend's CSV read defaults and let callers override.
        options = _read_csv_defaults.copy()
        options.update(kwargs)

        if schema:
            # An explicit schema is incompatible with inferSchema=True.
            assert ('inferSchema', True) not in options.items()
            schema = spark_dtype(schema)
            options['schema'] = schema
        else:
            options['inferSchema'] = True

        df = self._session.read.csv(path, **options)

        if temp_view:
            if force:
                df.createOrReplaceTempView(name)
            else:
                # Raises if a temp view with this name already exists.
                df.createTempView(name)
        else:
            qualified_name = _fully_qualified_name(
                name, database or self.current_database)
            # mode='error' refuses to clobber an existing table unless forced.
            mode = 'overwrite' if force else 'error'
            df.write.saveAsTable(qualified_name, format=format, mode=mode)
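
For context, a minimal standalone sketch of the PySpark calls this method wraps, assuming a local SparkSession; the CSV path, view, and table names are hypothetical:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

# schema=None branch: let Spark infer the column types from the file.
df = spark.read.csv('/tmp/people.csv', header=True, inferSchema=True)

# temp_view=True: force picks createOrReplaceTempView over createTempView.
df.createOrReplaceTempView('people_view')

# temp_view=False: persist as a table; force maps to mode='overwrite'.
df.write.saveAsTable('default.people', format='parquet', mode='overwrite')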
Example #2
def compile_reduction_udf(t, expr, scope, timecontext, context=None, **kwargs):
    op = expr.op()

    # Wrap the user's function as a grouped-aggregate pandas UDF with the
    # Spark type that corresponds to the declared ibis output type.
    spark_output_type = spark_dtype(op._output_type)
    spark_udf = pandas_udf(
        op.func, spark_output_type, PandasUDFType.GROUPED_AGG
    )
    func_args = (t.translate(arg, scope, timecontext) for arg in op.func_args)

    col = spark_udf(*func_args)
    if context in (AggregationContext.ENTIRE, AggregationContext.GROUP):
        # The surrounding aggregation (e.g. groupBy().agg()) applies the column.
        return col
    elif context == AggregationContext.WINDOW:
        window = kwargs['window']
        return col.over(window)
    else:
        # No aggregation context: reduce over the entire source table.
        src_table = t.translate(op.func_args[0].op().table, scope, timecontext)
        return src_table.agg(col)
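
To see the two aggregation contexts in plain PySpark, here is a self-contained sketch with a hand-written GROUPED_AGG pandas UDF; mean_udf and the sample frame are illustrative, not part of the ibis source:

import pandas as pd
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ['id', 'v'])

@pandas_udf('double', PandasUDFType.GROUPED_AGG)
def mean_udf(v: pd.Series) -> float:
    return v.mean()

# GROUP/ENTIRE context: the bare UDF column goes into groupBy().agg().
df.groupBy('id').agg(mean_udf(df['v'])).show()

# WINDOW context: the same column is wrapped with .over(window).
w = Window.partitionBy('id')
df.withColumn('v_mean', mean_udf(df['v']).over(w)).show()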
Example #3
    def __init__(self, input_type, output_type):
        # Normalize the declared signature to ibis dtypes, then precompute
        # the matching Spark type for the output.
        self.input_type = list(map(dt.dtype, input_type))
        self.output_type = dt.dtype(output_type)
        self.spark_output_type = spark_dtype(self.output_type)
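
A small sketch of the normalization the constructor performs, using only ibis's public datatype module; the example signature is hypothetical:

import ibis.expr.datatypes as dt

# dt.dtype accepts strings or dtype objects, so signatures may be written
# loosely, e.g. ['double', 'int64'], and still normalize to ibis dtypes.
input_type = list(map(dt.dtype, ['double', 'int64']))
output_type = dt.dtype('double')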
Example #4
def compile_elementwise_udf(t, expr, scope, timecontext, **kwargs):
    op = expr.op()
    # Elementwise UDFs map directly to a SCALAR pandas UDF: translate each
    # ibis argument to a Spark column and apply the wrapped function.
    spark_output_type = spark_dtype(op._output_type)
    spark_udf = pandas_udf(op.func, spark_output_type, PandasUDFType.SCALAR)
    func_args = (t.translate(arg, scope, timecontext) for arg in op.func_args)
    return spark_udf(*func_args)
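
The same wrapping in standalone PySpark, with a hypothetical plus_one standing in for op.func and a literal column standing in for the translated arguments:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,)], ['v'])

# A SCALAR pandas UDF receives and returns a pandas Series per batch.
@pandas_udf('double', PandasUDFType.SCALAR)
def plus_one(v: pd.Series) -> pd.Series:
    return v + 1.0

df.select(plus_one(df['v'])).show()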