def select(self, db_table):
    """Query entry point for this DAO: reads `db_table` from Postgres via the Spark JDBC reader."""
    try:
        return spark.getOrCreate().read\
            .format('jdbc')\
            .option('url', "jdbc:postgresql://%s:%s/%s" % (self.host, self.port, self.database))\
            .option('dbtable', db_table)\
            .option('user', self.user)\
            .option('password', self.password)\
            .option('driver', 'org.postgresql.Driver')\
            .load()
    except Exception as error:
        # Failures are logged and swallowed, so callers may receive None.
        logger.error(error)
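# A minimal usage sketch (not part of the original module): reading one table
# through the DAO. PostgresDAO() is constructed with no arguments elsewhere in
# this codebase; 'public.movies' is a hypothetical table name.
def _select_example():
    df = PostgresDAO().select('public.movies')
    # select() logs and swallows errors, so guard against a None result.
    if df is not None:
        df.printSchema()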
def generate_intermediary_entity(metadata):
    """Builds the intermediary (link) entity joining the left table to a dimension stored in Postgres."""
    model = metadata.get('model')
    options = metadata.get('options')
    field = metadata.get('description_field')
    table = options.get('right_table')
    database = options.get("right_database")
    right_table = PostgresDAO().select('%s.%s' % (database, table))
    left_table = check_nullity(metadata=metadata, field=field)
    if metadata.get('embbebedList') == False:
        data = left_table\
            .select(options.get('left_columns_selected'))\
            .join(right_table, col(options.get('right_original_column_target')) == getattr(right_table, options.get("right_column_target")))\
            .withColumnRenamed(options.get('left_original_column_id'), options.get('left_column_id'))
        data = MiddleEntityRules.createIndex(options.get('left_column_id'), options.get('right_column_id'), data)\
            .select(model().schema.fieldNames())
    else:
        # Embedded list: split the ';'-separated values and explode the
        # resulting array (one row per value) before joining the dimension.
        data = left_table\
            .select(options.get('left_columns_selected'))\
            .withColumn(field, split(col(options.get("right_original_column_target")), ';'))\
            .withColumn(field, explode(col(field)))\
            .join(right_table, col(options.get('right_original_column_target')) == getattr(right_table, options.get("right_column_target")))\
            .withColumnRenamed(options.get('left_original_column_id'), options.get('left_column_id'))\
            .withColumn(options.get('left_column_id'), col(options.get('left_column_id')).cast(IntegerType()))
        data = MiddleEntityRules.createIndex(options.get('left_column_id'), options.get('right_column_id'), data)
    if options.get("has_adicional_columns") == True:
        for i in options.get("adicional_columns"):
            setup = options.get("adicional_columns_definition")[i]
            data = data.withColumnRenamed(i, setup["name"])
            if setup.get("has_conversion_values") == True:
                # The conversion map is broadcast so the conversor UDF can
                # read it on the executors.
                global MAPPER
                MAPPER = spark.getOrCreate().sparkContext.broadcast(setup["conversion_values"])
                data = data.withColumn(setup['name'], conversor(setup['name']))
            data = data.withColumn(setup['name'], col(setup['name']).cast(get_type(setup['column_type'])))
    return data.select(model().schema.fieldNames())
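# A sketch, under assumptions, of the metadata/options contract the function
# above consumes. The keys mirror its options.get(...) calls exactly; every
# model, table, and column name below is a hypothetical stand-in.
def _intermediary_entity_example():
    metadata = {
        'model': MovieGenreModel,              # hypothetical link-table model exposing .schema
        'description_field': 'genre',
        'embbebedList': True,                  # 'genre' holds ';'-separated values
        'options': {
            'right_database': 'public',
            'right_table': 'dim_genre',
            'left_columns_selected': ['movie_id', 'genre'],
            'right_original_column_target': 'genre',
            'right_column_target': 'description',
            'left_original_column_id': 'movie_id',
            'left_column_id': 'id_movie',
            'right_column_id': 'id_genre',
            'has_adicional_columns': False,
        },
    }
    return generate_intermediary_entity(metadata)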
def from_csv(path, header=True):
    return spark.getOrCreate().read.format("csv")\
        .option("header", header)\
        .load(path)
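# Usage sketch for from_csv; both paths are hypothetical.
def _from_csv_example():
    with_header = from_csv('/data/raw/movies.csv')
    headerless = from_csv('/data/raw/ratings.csv', header=False)
    return with_header, headerless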
from common.utils import spark
from dao.postgres import PostgresDAO
from pyspark.sql.functions import col, explode, split, when, lit, udf, concat, asc, row_number
from dao.external.api.exchange import ExchangeDAO
from common.utils.utils import get_type
from pyspark.sql.window import Window
from pyspark.sql.types import *

# Exchange rates are fetched once on the driver and broadcast to executors.
currencies = spark.getOrCreate().sparkContext.broadcast(
    ExchangeDAO(sleep=0.5, retries=2, timeout=5).collect())


class DimensionRules(object):

    @staticmethod
    def create_index(field, source):
        # Pairs each collected Row's value with a 1-based surrogate key.
        return map(lambda x, y: (x + 1, y.asDict()[field]), range(0, len(source)), source)

    @staticmethod
    def dictionaryization(metadata):
        """Implements the business logic that builds the dimensional tables."""
        model = metadata.get('model')
        field = metadata.get('description_field')
        if metadata.get('embbebedList') == False:
            # Plain column: deduplicate the raw values.
            data = check_nullity(field=field, metadata=metadata)\
                .select(field)\
                .distinct()\
                .collect()
        else:
            # Embedded list: split the ';'-separated values and explode them
            # into one row per value before deduplicating.
            data = check_nullity(field=field, metadata=metadata)\
                .select(field)\
                .withColumn(field, split(col(field), ';'))\
                .withColumn(field, explode(col(field)))\
                .distinct()\
                .collect()
        return spark.getOrCreate().createDataFrame(
            DimensionRules.create_index(field, data), model().schema)
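# Illustrative sketch (not in the original module) of the two methods above.
# create_index zips each collected Row with a 1-based surrogate key;
# CategoryModel and the 'category' field are hypothetical stand-ins.
def _dimension_rules_example():
    from pyspark.sql import Row
    rows = [Row(category='drama'), Row(category='comedy')]
    pairs = list(DimensionRules.create_index('category', rows))
    # pairs == [(1, 'drama'), (2, 'comedy')]

    metadata = {
        'model': CategoryModel,          # hypothetical model whose instance exposes .schema
        'description_field': 'category',
        'embbebedList': True,            # values arrive as 'a;b;c' strings
    }
    dim_df = DimensionRules.dictionaryization(metadata)
    return pairs, dim_df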