Example #1
import sys

from pyspark import SparkConf, SparkContext

def main():
    # input parameters
    if len(sys.argv) < 4:
        print("you didnt give directory inputs, using test file")
        input_dir = "test_input"
        input_fn = "ratings_tiny_processed.csv"
        input_file_path = get_abs_file_path(input_dir, input_fn)
        output_prob_fn = "test_prob"
        output_lift_fn = "test_lift"
    else:
        input_fn = sys.argv[1]
        output_prob_fn = sys.argv[2]
        output_lift_fn = sys.argv[3]
        input_dir = "data"
        input_file_path = get_abs_file_path(input_dir, input_fn)

    # initialize spark
    conf = SparkConf().setMaster("local").setAppName("spark_cooccurrences.py")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.executor.cores", "6")
    sc = SparkContext(conf=conf)

    # read in file
    data = sc.textFile(input_file_path)

    # take out header
    header = data.first()
    data = data.filter(lambda x: x != header)

    # count reviews (the header was already filtered out above)
    n_reviews = data.count()

    # convert each CSV line to a (user, movie) key-value pair
    # [(u_1, m_i), ...]
    user_pairs = data.map(lambda x: tuple(int(i) for i in x.split(",")))

    # sorting each user's movies ensures pair (i, j) == pair (j, i)
    # group movies by user: [(u_i, [m_1, m_2, ...]), ...]
    grouped_users = user_pairs.groupByKey().map(lambda x: (x[0], sorted(x[1])))
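    # e.g. (toy illustration) [(1, [10, 20, 30]), (2, [10, 30])]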

    # drop users with fewer than two movies -- they yield no pairs
    filtered_users = grouped_users.filter(lambda x: len(x[1]) >= 2)

    # build one co-occurrence dictionary (stripe) per user; stripes reduce
    # communication costs compared to emitting individual pairs
    # [(u_i, {m_j: {m_k: 1, ...}, ...}), ...] -- every count starts at 1
    user_movie_dicts = filtered_users.map(lambda x: (x[0], create_movie_dict(x[1])))

    # make key pairs of movie_i, stripe_i
    # [(movie_i, stripe_i), ...]
    movie_stripes = user_movie_dicts.flatMap(lambda x: create_stripe(x[1]))

    # aggregate stripes and sum counts
    # [(m1, {m2:count2, m4:count4}), ...] 
    combined_stripes = movie_stripes.reduceByKey(increment_stripes)
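    # e.g. (toy illustration) merging {10: 2, 20: 1} with {10: 1, 30: 1}
    # gives {10: 3, 20: 1, 30: 1}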

    # flatten stripes into pair counts
    # [((m_i, m_j), count), ...]
    counts = combined_stripes.flatMap(lambda kv: convert_stripes_to_tups(kv[0], kv[1]))

    # collect the pair counts to the driver: {(m_i, m_j): count, ...}
    stripe_count_dict = counts.collectAsMap()

    # per-movie counts, used as denominators below: {m_i: n_users, ...}
    movie_counts_dict = user_pairs.map(lambda x: (x[1], 1)) \
                                  .reduceByKey(lambda a, b: a + b) \
                                  .collectAsMap()

    # distribute the list of pair keys, [(A, B), ...]
    keys_rdd = sc.parallelize(list(stripe_count_dict.keys())).cache()

    # p(a|b) := p(a&b)/p(b) = |a&b|/|b| <- magnitudes or counts
    # Perform calc P(A|B)
    conditional_probs_rdd = keys_rdd.map(lambda k: (k, float(stripe_count_dict[k]) /
                                       movie_counts_dict[k[1]]))
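    # e.g. (toy illustration) if (A, B) co-occurs for 30 users and B was
    # rated by 60 users, then P(A|B) = 30 / 60 = 0.5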

    # keep only pairs with P(A|B) below 0.8
    trimmed_conditionals = conditional_probs_rdd.filter(lambda x: x[1] < 0.8)

    # lift(A&B) := P(A&B)/P(A) = |A&B|/|A| <- magnitudes or counts
    # Perform lift calc P(A&B)/P(A)
    lift_rdd = keys_rdd.map(lambda k: (k, float(stripe_count_dict[k]) /
                                       movie_counts_dict[k[0]]))
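    # e.g. (toy illustration) with the same 30 co-occurrences and A rated
    # by 15 users, the ratio is 30 / 15 = 2.0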

    # keep only pairs with a ratio above 1.6
    trimmed_lift = lift_rdd.filter(lambda x: x[1] > 1.6)

    # Output results
    output_dir = "output/spark"
    save_rdd_to_disk(output_dir, output_prob_fn, trimmed_conditionals)
    save_rdd_to_disk(output_dir, output_lift_fn, trimmed_lift)
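The helper functions used above (get_abs_file_path, create_movie_dict, create_stripe, increment_stripes, convert_stripes_to_tups, save_rdd_to_disk) are not shown in this example. The sketch below is one plausible reconstruction inferred from how they are called, not the original implementations.

import os


def get_abs_file_path(directory, filename):
    # resolve a path relative to this script's location
    base = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(base, directory, filename)


def create_movie_dict(movies):
    # nested co-occurrence dict for one user's sorted movie list:
    # {m_j: {m_k: 1, ...}, ...} for every pair with j < k
    return {m_j: {m_k: 1 for m_k in movies[i + 1:]}
            for i, m_j in enumerate(movies)}


def create_stripe(movie_dict):
    # flatten the nested dict into (movie, stripe) pairs for flatMap
    return list(movie_dict.items())


def increment_stripes(stripe_x, stripe_y):
    # merge two stripes, summing counts for movies present in both
    merged = dict(stripe_x)
    for movie, count in stripe_y.items():
        merged[movie] = merged.get(movie, 0) + count
    return merged


def convert_stripes_to_tups(m_i, stripe):
    # expand one stripe into ((m_i, m_j), count) tuples
    return [((m_i, m_j), count) for m_j, count in stripe.items()]


def save_rdd_to_disk(output_dir, output_fn, rdd):
    # saveAsTextFile writes a directory of part files under output_dir
    rdd.saveAsTextFile(os.path.join(output_dir, output_fn))


if __name__ == "__main__":
    main()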