Code example #1
def select(self, db_table):
    """Read the given table from PostgreSQL into a Spark DataFrame via JDBC."""
    try:
        return spark.getOrCreate().read\
            .format('jdbc')\
            .option('url', 'jdbc:postgresql://%s:%s/%s' % (self.host, self.port, self.database))\
            .option('dbtable', db_table)\
            .option('user', self.user)\
            .option('password', self.password)\
            .option('driver', 'org.postgresql.Driver')\
            .load()
    except Exception as error:
        # Errors are logged and swallowed, so callers receive None on failure.
        logger.error(error)
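
A minimal usage sketch, assuming PostgresDAO's constructor wires host, port, database, user, and password (the table name below is hypothetical):

dao = PostgresDAO()
df = dao.select('public.movies')  # hypothetical schema-qualified table name
if df is not None:  # select() logs and swallows errors, returning None on failure
    df.show(5)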
Code example #2
File: rules.py Project: marcosvgj/stackoverflow_case
    def dictionaryization(metadata):
        """Implements the business logic for dimensional tables."""
        model = metadata.get('model')
        field = metadata.get('description_field')

        if metadata.get('embbebedList') is False:
            # Single-valued field: keep only the distinct values.
            data = check_nullity(field=field, metadata=metadata)\
                .select(field)\
                .distinct()\
                .collect()
        else:
            # Multi-valued field: split the ';'-separated list and explode one value per row.
            data = check_nullity(field=field, metadata=metadata)\
                .select(field)\
                .withColumn(field, split(col(field), ';'))\
                .withColumn(field, explode(col(field)))\
                .distinct()\
                .collect()

        return spark.getOrCreate().createDataFrame(DimensionRules.create_index(field, data), model().schema)
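
A sketch of the metadata dictionary this method expects, inferred from the keys it reads (the model class and values below are hypothetical):

# Hypothetical metadata; only the keys read by dictionaryization() are shown.
metadata = {
    'model': GenreModel,            # hypothetical: callable whose instance exposes a .schema StructType
    'description_field': 'genre',   # column holding the dimension description
    'embbebedList': False,          # project's own key name; True for ';'-separated multi-value columns
}
dimension_df = DimensionRules.dictionaryization(metadata)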
Code example #3
File: rules.py Project: marcosvgj/stackoverflow_case
    def generate_intermediary_entity(metadata):
        """Builds the intermediary entity joining the left dataset to a Postgres dimension table."""
        model = metadata.get('model')
        options = metadata.get('options')
        field = metadata.get('description_field')
        table = options.get('right_table')
        database = options.get('right_database')

        right_table = PostgresDAO().select('%s.%s' % (database, table))
        left_table = check_nullity(metadata=metadata, field=field)

        if metadata.get('embbebedList') is False:
            # Single-valued field: join directly on the target column.
            data = left_table\
                .select(options.get('left_columns_selected'))\
                .join(right_table, col(options.get('right_original_column_target')) == getattr(right_table, options.get('right_column_target')))\
                .withColumnRenamed(options.get('left_original_column_id'), options.get('left_column_id'))

            data = MiddleEntityRules.createIndex(options.get('left_column_id'), options.get('right_column_id'), data)\
                .select(model().schema.fieldNames())
        else:
            # Multi-valued field: split the ';'-separated column and explode one value per row,
            # then join each exploded value against the dimension table.
            data = left_table\
                .select(options.get('left_columns_selected'))\
                .withColumn(field, split(col(options.get('right_original_column_target')), ';'))\
                .withColumn(field, explode(col(field)))\
                .join(right_table, col(options.get('right_original_column_target')) == getattr(right_table, options.get('right_column_target')))\
                .withColumnRenamed(options.get('left_original_column_id'), options.get('left_column_id'))\
                .withColumn(options.get('left_column_id'), col(options.get('left_column_id')).cast(IntegerType()))

            data = MiddleEntityRules.createIndex(options.get('left_column_id'), options.get('right_column_id'), data)
        if options.get('has_adicional_columns') is True:
            for i in options.get('adicional_columns'):
                setup = options.get('adicional_columns_definition')[i]
                data = data.withColumnRenamed(i, setup['name'])
                if setup.get('has_conversion_values') is True:
                    # conversor is a UDF that reads the module-level MAPPER broadcast,
                    # so MAPPER must be reassigned before converting each column.
                    global MAPPER
                    MAPPER = spark.getOrCreate().sparkContext.broadcast(setup['conversion_values'])
                    data = data.withColumn(setup['name'], conversor(setup['name']))
                    data = data.withColumn(setup['name'], col(setup['name']).cast(get_type(setup['column_type'])))
        return data.select(model().schema.fieldNames())
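
A sketch of the options block this method consumes, again inferred from the keys it reads; every name and value here is hypothetical:

# Hypothetical options for a movie-genre intermediary table.
options = {
    'right_table': 'genres',                  # dimension table in Postgres
    'right_database': 'public',               # schema, combined as '%s.%s' above
    'left_columns_selected': ['movie_id', 'genre'],
    'right_original_column_target': 'genre',  # left-side join/split column
    'right_column_target': 'description',     # right-side join column
    'left_original_column_id': 'movie_id',
    'left_column_id': 'id_movie',
    'right_column_id': 'id_genre',
    'has_adicional_columns': False,           # key name as spelled in the project
}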
Code example #4
File: base.py Project: marcosvgj/stackoverflow_case
def from_csv(path, header=True):
    """Load a CSV file into a Spark DataFrame, treating the first row as a header by default."""
    return spark.getOrCreate().read.format("csv")\
        .option("header", header)\
        .load(path)
Code example #5
File: rules.py Project: marcosvgj/stackoverflow_case
from common.utils import spark
from dao.postgres import PostgresDAO
from pyspark.sql.functions import col, explode, split, when, lit, udf, concat, asc, row_number
from dao.external.api.exchange import ExchangeDAO
from common.utils.utils import get_type
from pyspark.sql.window import Window
from pyspark.sql.types import *

# Exchange rates are fetched once at import time and broadcast to all executors.
currencies = spark.getOrCreate().sparkContext.broadcast(ExchangeDAO(sleep=0.5, retries=2, timeout=5).collect())

class DimensionRules(object):
    @staticmethod
    def create_index(field, source):
        # Pair each collected Row's value with a 1-based surrogate key.
        return [(index + 1, row.asDict()[field]) for index, row in enumerate(source)]
    
    @staticmethod
    def dictionaryization(metadata):
        """Implements the business logic for dimensional tables."""
        model = metadata.get('model')
        field = metadata.get('description_field')

        if metadata.get('embbebedList') is False:
            # Single-valued field: keep only the distinct values.
            data = check_nullity(field=field, metadata=metadata)\
                .select(field)\
                .distinct()\
                .collect()
        else:
            # Multi-valued field: split the ';'-separated list and explode one value per row.
            data = check_nullity(field=field, metadata=metadata)\
                .select(field)\
                .withColumn(field, split(col(field), ';'))\
                .withColumn(field, explode(col(field)))\