def getSparkDF(self, input_path):
    conf = SparkConfiguration().getSparkConf()
    spark = SparkSession.builder. \
        config(conf=conf). \
        appName("pyspark postgres performance test"). \
        getOrCreate()

    schema = FileSchema().empSchema()
    df = spark.read.format("csv").load(input_path, schema=schema, inferSchema=False)
    print("No of Partitions:{0}".format(df.rdd.getNumPartitions()))
    return df
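# The app name above suggests the returned DataFrame feeds a Postgres load, but the write
# side is not part of this excerpt. The helper below is only a hedged sketch of that step:
# the function name, connection URL, table, and credentials are placeholders, not taken
# from the original project.
def writeToPostgres(df, table):
    df.write. \
        format("jdbc"). \
        option("url", "jdbc:postgresql://localhost:5432/testdb"). \
        option("dbtable", table). \
        option("user", "postgres"). \
        option("password", "postgres"). \
        option("driver", "org.postgresql.Driver"). \
        mode("append"). \
        save()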
import sys

from pyspark.sql import SparkSession

from com.rposam.config.SparkConf import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()
    Driver = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("ETL Pipeline using Airflow CSV To Parquet"). \
        getOrCreate()

    logger = Log4j(Driver)

    logger.info("Fetching schema of source file")
    schema = FileSchema().empSchema()

    # Source and target paths are supplied as command-line arguments (e.g. by the Airflow task).
    source = sys.argv[1]
    target = sys.argv[2]
    logger.info("Source is {0} and target is {1}".format(source, target))

    logger.info("Started reading data from sources")
    empDF = Driver.read. \
        format("csv"). \
        schema(schema=schema). \
        option("header", "false"). \
        load(source)
    empDF.show()
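    # Hedged sketch of the likely remaining step (not shown in the excerpt): the job name
    # says "CSV To Parquet", so the DataFrame presumably lands at the target path passed
    # in sys.argv[2]. The save mode and log message here are assumptions.
    logger.info("Writing data to {0} in parquet format".format(target))
    empDF.write. \
        mode("overwrite"). \
        parquet(target)

    Driver.stop()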
    if re.search(regex, str(email)):
        return True
    else:
        return False


def parseGender(gender):
    # Map the many free-form gender spellings onto "M", "F", or "N" (not specified).
    if str(gender) in ["Male", "M", "male", "m", "MALE"]:
        return "M"
    elif str(gender) in ["Female", "F", "f", "female", "feMale", "FEMALE"]:
        return "F"
    else:
        return "N"


FileSchema = FileSchema()._20000recordsSchema()

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()
    warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"
    spark = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("Custom UDF for Email and Gender validations and also a LTI Assignment"). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", "thrift://localhost:9083"). \
        enableHiveSupport(). \
        config("spark.sql.shuffle.partitions", 10). \
        getOrCreate()

    logger = Log4j(spark)

    logger.info("Reading csv file with dropmalformed mode")
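    # Hedged sketch of what likely follows (not shown in the excerpt): register the helpers
    # above as UDFs and read the 20000-records CSV with DROPMALFORMED mode. The input path,
    # column names ("email", "gender"), and "validateEmail" (a stand-in for the email-check
    # function whose definition is cut off above) are assumptions for illustration.
    from pyspark.sql.functions import col, udf
    from pyspark.sql.types import BooleanType, StringType

    parseGenderUDF = udf(parseGender, StringType())
    validateEmailUDF = udf(validateEmail, BooleanType())

    df = spark.read. \
        schema(FileSchema). \
        option("mode", "DROPMALFORMED"). \
        option("header", "true"). \
        csv("/path/to/20000 Records.csv")

    cleanedDF = df. \
        withColumn("gender", parseGenderUDF(col("gender"))). \
        withColumn("is_valid_email", validateEmailUDF(col("email")))
    cleanedDF.show()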
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, expr
from pyspark.sql.types import StructType, StructField, ArrayType, StringType

from com.rposam.config.SparkConf import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

schema = FileSchema().randomuserapiSchema()

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()

    # For locally installed Spark and Hadoop:
    # warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"
    # thriftServer = "thrift://localhost:9083"

    # For the itversity cluster:
    warehouseLocation = "hdfs://nn01.itversity.com:8020/user/rposam2021/warehouse/rposam2021_hivedb.db"
    thriftServer = "thrift://gw02.itversity.com:9083"
    os.environ["HADOOP_USER_NAME"] = "rposam2021"

    spark = SparkSession.builder. \
        appName("Read json and write to local installed spark on ubuntu"). \
        config(conf=conf). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", thriftServer). \
        enableHiveSupport(). \
        getOrCreate()

    logger = Log4j(spark)
    logger.info("Spark session created using enableHiveSupport")

    df = spark.read.schema(schema=schema).option(