def main(): spark_conf = SparkConf().setAppName("Text Preprocesser").set( "spark.cores.max", "30") global sc sc = SparkContext(conf=spark_conf) sc.setLogLevel("ERROR") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py") global sql_context sql_context = SQLContext(sc) start_time = time.time() preprocess_all() end_time = time.time() print( colored( "Preprocessing run time (seconds): {0}".format(end_time - start_time), "magenta"))
def main():
    # spark_conf = SparkConf().setAppName("Text Preprocessor").set("spark.cores.max", "30")
    global sc
    # sc = SparkContext(conf=spark_conf)

    # Point the spark-redis connector at the Redis host defined in config.py.
    sc_conf = SparkConf()
    sc_conf.set("spark.redis.host", config.REDIS_SERVER)
    sc_conf.set("spark.redis.port", "6379")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_files(config.S3_BUCKET, config.S3_FOLDER_EXTRACTED)
    end_time = time.time()
    print(
        colored(
            "Preprocessing run time (seconds): {0}".format(end_time - start_time),
            "magenta"))
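# Note (not from the source): config/config.py shipped to the executors above is
# assumed to expose at least the fields referenced in this file. The values below
# are placeholders, not real endpoints:
#
#   REDIS_SERVER = "<redis-host>"
#   S3_BUCKET = "<bucket-name>"
#   S3_FOLDER_EXTRACTED = "<key-prefix>"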
def main(): spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set( "spark.cores.max", "30") global sc sc = SparkContext(conf=spark_conf) sc.setLogLevel("ERROR") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/min_hash.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/locality_sensitive_hash.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py") global sql_context sql_context = SQLContext(sc) start_time = time.time() run_minhash_lsh() end_time = time.time() print( colored( "Spark Custom MinHashLSH run time (seconds): {0} seconds".format( end_time - start_time), "magenta"))
def main(): spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set("spark.cores.max", "30") global sc global sql_context sc = SparkContext(conf=spark_conf) sc.setLogLevel("ERROR") sql_context = SQLContext(sc) sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py") sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py") start_time = time.time() similarity_scores_df = compare_text() config = configparser.ConfigParser() config.read('../config/db_properties.ini') similarity_scores_df.write.jdbc(config['postgres']['url'], config['postgres']['table'], mode='overwrite', properties={'user': config['postgres']['user'], 'password': config['postgres']['password']}) end_time = time.time() print(colored("Spark MinHash run time (seconds): {0} seconds".format(end_time - start_time), "magenta"))
if __name__ == '__main__':
    # Creating the streaming spark context and spark session
    sp_context = SparkContext('local[2]', "Read_Stream")
    ssp_context = StreamingContext(sp_context, 5.000)
    sp_sess = SparkSession.builder.appName('Read_Data').getOrCreate()

    # Adding the files to Spark Context
    # sp_context.addFile("calc_stats_1.py")
    # sp_context.addFile("calc_stats_2.py")
    # sp_context.addFile("init_stats.py")
    # sp_context.addFile("model.py")
    # sp_context.addFile("chances_of_winning.py")
    sp_context.addFile('metrics.py')

    # Reading the CSV files using Spark session
    players = sp_sess.read.csv(play_path, header=True, inferSchema=True)
    teams = sp_sess.read.csv(team_path, header=True, inferSchema=True)

    # Importing the files
    from metrics import *

    """global player_chemistry
    global player_ratings
    global regr_player
    global player_profile
    global match_details
    """

    # Initializing the player chemistry and ratings
    player_chemistry = init_chemistry(players)
write_to_file("player_rate.json", player_rate_json) write_to_file("player_profile.json", player_prof_json) write_to_file("match_details.json", match_details_json) # Writing the JSON strings to files if __name__ == '__main__': # Creating the streaming spark context and spark session sp_context = SparkContext('local[2]', "Read_Stream") ssp_context = StreamingContext(sp_context, 5.000) sp_sess = SparkSession.builder.appName('Read_Data').getOrCreate() # Adding the files to Spark Context sp_context.addFile("calc_stats_1.py") sp_context.addFile("calc_stats_2.py") sp_context.addFile("init_stats.py") sp_context.addFile("model.py") sp_context.addFile("chances_of_winning.py") # Reading the CSV files using Spark session players = sp_sess.read.csv(play_path, header=True, inferSchema=True) teams = sp_sess.read.csv(team_path, header=True, inferSchema=True) # Importing the files from calc_stats_1 import * from calc_stats_2 import * from init_stats import * from model import * from chances_of_winning import *
"own_goals": i['own_goals'], "yellow_cards": i['yellow_cards'], "red_cards": i['red_cards'] } json_object = json.dumps(dictionary, indent=4) # Writing to sample.json with open("output_req_2.json", "w") as outfile: print("Writing....to JSON") outfile.write(json_object) break if __name__ == "__main__": sp_context = SparkContext('local[2]', "UI") sp_sess = SparkSession.builder.appName('user_input').getOrCreate() sp_context.addFile("model.py") input_file = sys.argv[1] with open(input_file, 'r') as file: content = file.read() input_data = eval(content) if input_data["req_type"] == 1: # calling predict function: """ output = predict(input_) """ predict_helper(input_data) elif input_data["req_type"] == 2: # calling profile function player_profile_helper(input_data)
                json_object = json.dumps(dictionary, indent=4)

                # Writing the match details to output_req_3.json
                with open("output_req_3.json", "w") as outfile:
                    outfile.write(json_object)
                return
            break

    # No matching record was found; record that in the output file.
    with open("output_req_3.json", "w") as outfile:
        dicte = {"match_found": False}
        outfile.write(json.dumps(dicte, indent=4))


if __name__ == "__main__":
    sp_context = SparkContext('local[2]', "UI")
    sp_sess = SparkSession.builder.appName('user_input').getOrCreate()
    sp_context.addFile("metrics.py")

    input_file = sys.argv[1]
    with open(input_file, 'r') as file:
        content = file.read()
    input_data = eval(content)

    if 'req_type' not in input_data:
        # calling match info function
        match_data_helper(input_data)
    else:
        if input_data["req_type"] == 1:
            # calling predict function:
            """ output = predict(input_) """
            predict_helper(input_data)
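# Note (not from the source): both UI entry points above parse the request file
# with eval(), which executes whatever Python the file contains. A minimal sketch
# of a safer alternative using ast.literal_eval, which only accepts Python
# literals such as the {"req_type": 1} dictionaries these scripts expect;
# load_request is a hypothetical helper, not part of the project:

import ast


def load_request(path):
    # Read the request file and parse it as a plain Python literal (e.g. a dict).
    # Raises ValueError or SyntaxError if the file is not a literal.
    with open(path, 'r') as fh:
        return ast.literal_eval(fh.read())

# Usage sketch: input_data = load_request(sys.argv[1])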