def generate_pre_transform_specs_data_frame(self,
                                            spark_context=None,
                                            sql_context=None):
        """Load the 'pre_transform_specs' table into a JSON-derived data frame.

        The table is read over JDBC using the connection string supplied by
        ``DbUtil.get_java_db_connection_string()``.  The 'pre_transform_spec'
        column of every row is parsed and re-serialized as JSON, and the
        resulting strings are turned into a data frame that is stored on
        ``self.pre_transform_specs_data_frame``.  Nothing is returned.

        :param spark_context: context used to parallelize the JSON strings.
        :param sql_context: context used for the JDBC read and JSON load.

        Both arguments default to None but are dereferenced unconditionally,
        so callers are expected to pass real contexts.
        """
        # The whole table is pulled to the driver before being re-parsed.
        spec_rows = DataFrameReader(sql_context).jdbc(
            DbUtil.get_java_db_connection_string(),
            'pre_transform_specs').collect()

        # Round-trip every spec through the json module so that only
        # well-formed JSON text is handed to the JSON loader below.
        normalized_specs = [
            json.dumps(json.loads(row['pre_transform_spec']))
            for row in spec_rows
        ]

        self.pre_transform_specs_data_frame = sql_context.read.json(
            spark_context.parallelize(normalized_specs))
    def generate_pre_transform_specs_data_frame(self, spark_context=None,
                                                sql_context=None):
        """Load the 'pre_transform_specs' table into a JSON-derived data frame.

        The table is read over JDBC using ``self.get_connection_string()``.
        The 'pre_transform_spec' column of every row is parsed and
        re-serialized as JSON, and the resulting strings are loaded into a
        data frame stored on ``self.pre_transform_specs_data_frame``.
        Nothing is returned.

        :param spark_context: context used to parallelize the JSON strings.
        :param sql_context: context used for the JDBC read and JSON load.

        Both arguments default to None but are dereferenced unconditionally,
        so callers are expected to pass real contexts.
        """
        data_frame_reader = DataFrameReader(sql_context)
        pre_transform_specs_data_frame = data_frame_reader.jdbc(
            self.get_connection_string(),
            'pre_transform_specs'
        )
        # Round-trip every spec through the json module so that only
        # well-formed JSON text is handed to the JSON loader below.
        data = [
            json.dumps(json.loads(item['pre_transform_spec']))
            for item in pre_transform_specs_data_frame.collect()
        ]

        json_rdd = spark_context.parallelize(data)
        # jsonRDD is a legacy SQLContext method that newer Spark releases no
        # longer provide.  Keep using it where it exists (unchanged
        # behaviour) and otherwise fall back to the DataFrameReader API.
        legacy_json_loader = getattr(sql_context, 'jsonRDD', None)
        if legacy_json_loader is not None:
            data_frame = legacy_json_loader(json_rdd)
        else:
            data_frame = sql_context.read.json(json_rdd)
        self.pre_transform_specs_data_frame = data_frame
    def generate_transform_specs_data_frame(self, spark_context=None,
                                            sql_context=None):
        """Load the 'transform_specs' table into a JSON-derived data frame.

        The table is read over JDBC using the connection string supplied by
        ``DbUtil.get_java_db_connection_string()``.  The 'transform_spec'
        column of every row is parsed and re-serialized as JSON, and the
        resulting strings are turned into a data frame that is stored on
        ``self.transform_specs_data_frame``.  Nothing is returned.

        :param spark_context: context used to parallelize the JSON strings.
        :param sql_context: context used for the JDBC read and JSON load.

        Both arguments default to None but are dereferenced unconditionally,
        so callers are expected to pass real contexts.
        """
        # The whole table is pulled to the driver before being re-parsed.
        spec_rows = DataFrameReader(sql_context).jdbc(
            DbUtil.get_java_db_connection_string(),
            'transform_specs').collect()

        # Round-trip every spec through the json module so that only
        # well-formed JSON text is handed to the JSON loader below.
        normalized_specs = [
            json.dumps(json.loads(row['transform_spec']))
            for row in spec_rows
        ]

        self.transform_specs_data_frame = sql_context.read.json(
            spark_context.parallelize(normalized_specs))
Exemple #4
0
def method2(sql_context: SQLContext, database_URL: str,
            database_properties: dict):
    """Read the ``RATINGS`` table over JDBC and return the resulting DataFrame.

    A progress message is printed to stdout before the read is set up.

    :param sql_context: context the ``DataFrameReader`` is built from.
    :param database_URL: JDBC URL passed as ``url``.
    :param database_properties: connection properties passed as
        ``properties``.
    :return: whatever ``DataFrameReader.jdbc`` returns for the table.
    """
    print('fetching jdbc dataframe...')

    # Configure the reader first; the fetch size is passed as a string option.
    reader = DataFrameReader(sql_context).option("fetchSize", "5001")

    # No partitioning arguments (column, lowerBound, upperBound,
    # numPartitions) are passed to the read.
    return reader.jdbc(
        url=database_URL,
        table='RATINGS',
        properties=database_properties,
    )