def main():
    # Input parameters: fall back to the small test file when no arguments are given
    if len(sys.argv) < 4:
        print("You didn't give directory inputs; using test file")
        input_dir = "test_input"
        input_fn = "ratings_tiny_processed.csv"
        input_file_path = get_abs_file_path(input_dir, input_fn)
        output_prob_fn = "test_prob"
        output_lift_fn = "test_lift"
    else:
        input_fn = sys.argv[1]
        output_prob_fn = sys.argv[2]
        output_lift_fn = sys.argv[3]
        input_dir = "data"
        input_file_path = get_abs_file_path(input_dir, input_fn)

    # Initialize Spark. SparkConf has no setExecutorMemory/setExecutorCores
    # methods; executor resources are set through the generic config keys.
    conf = SparkConf().setMaster("local").setAppName("spark_cooccurrences.py")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.executor.cores", "6")
    sc = SparkContext(conf=conf)

    # Read in the file and take out the header line
    data = sc.textFile(input_file_path)
    header = data.first()
    data = data.filter(lambda x: x != header)

    # Number of reviews (the header has already been filtered out)
    n_reviews = data.count()

    # Convert each CSV line of strings into a (user, movie) key-value pair
    # [(u1, m_i), ...]
    user_pairs = data.map(lambda x: tuple(int(i) for i in x.split(",")))

    # Group movies by user; sorting makes sure that (i, j) == (j, i)
    # [(u_i, [sorted movies rated by u_i]), ...]
    grouped_users = user_pairs.groupByKey().map(lambda x: (x[0], sorted(x[1])))

    # Drop users who rated fewer than two movies -- they contribute no pairs
    filtered_users = grouped_users.filter(lambda x: len(x[1]) >= 2)

    # Build a dictionary (stripe) per user; stripes reduce communication costs
    # [(u_i, {m_j: {m_k: count_jk}}), ...] -- every count starts at 1
    user_movie_dicts = filtered_users.map(lambda x: (x[0], create_movie_dict(x[1])))

    # Make key pairs of movie_i, stripe_i
    # [(movie_i, stripe_i), ...]
    movie_stripes = user_movie_dicts.flatMap(lambda x: create_stripe(x[1]))

    # Aggregate stripes and sum counts
    # [(m1, {m2: count2, m4: count4}), ...]
    combined_stripes = movie_stripes.reduceByKey(lambda x, y: increment_stripes(x, y))

    # Convert stripes to pair counts
    # [((m_i, m_j), count), ...]
    counts = combined_stripes.flatMap(lambda x: convert_stripes_to_tups(x[0], x[1]))

    # Collect pair counts as a dict {(m_i, m_j): count}, plus per-movie counts
    # {m_i: count} for the denominators of the probability and lift calculations
    stripe_count_dict = counts.collectAsMap()
    movie_counts_dict = user_pairs.map(lambda x: x[1]).countByValue()

    # First create an RDD of the keys, [(A, B), ...]
    keys_rdd = sc.parallelize(list(stripe_count_dict.keys())).cache()

    # P(A|B) := P(A&B)/P(B) = |A&B|/|B|  <- magnitudes or counts
    conditional_probs_rdd = keys_rdd.map(
        lambda k: (k, float(stripe_count_dict[k]) / movie_counts_dict[k[1]]))
    trimmed_conditionals = conditional_probs_rdd.filter(lambda x: x[1] < 0.8)

    # lift(A&B) := P(A&B)/P(A) = |A&B|/|A|  <- magnitudes or counts
    lift_rdd = keys_rdd.map(
        lambda k: (k, float(stripe_count_dict[k]) / movie_counts_dict[k[0]]))
    trimmed_lift = lift_rdd.filter(lambda x: x[1] > 1.6)

    # Output results
    output_dir = "output/spark"
    save_rdd_to_disk(output_dir, output_prob_fn, trimmed_conditionals)
    save_rdd_to_disk(output_dir, output_lift_fn, trimmed_lift)
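
# The stripe helpers called in main() -- create_movie_dict, create_stripe,
# increment_stripes, and convert_stripes_to_tups -- are expected to be defined
# elsewhere in this module. If they are not, the minimal sketch below matches
# how main() calls them; treat it as an illustration of the stripes approach,
# not the definitive implementations.


def create_movie_dict(sorted_movies):
    # For one user's sorted movie list, build a stripe per movie:
    # {m_j: {m_k: 1}} for every m_k after m_j, so each unordered pair
    # is emitted exactly once.
    stripes = {}
    for i, m_j in enumerate(sorted_movies):
        stripes[m_j] = {m_k: 1 for m_k in sorted_movies[i + 1:]}
    return stripes


def create_stripe(movie_dict):
    # Flatten one user's nested dict into (movie_i, stripe_i) pairs.
    return list(movie_dict.items())


def increment_stripes(stripe_a, stripe_b):
    # Merge two stripes for the same movie, summing co-occurrence counts.
    merged = dict(stripe_a)
    for movie, count in stripe_b.items():
        merged[movie] = merged.get(movie, 0) + count
    return merged


def convert_stripes_to_tups(m_i, stripe):
    # Expand a stripe back into ((m_i, m_j), count) tuples.
    return [((m_i, m_j), count) for m_j, count in stripe.items()]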