Example #1
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    keyspace = argv[0]
    output = argv[1]

    # initialize spark cassandra
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    #Get RDD from Cassandra
    request = sc.cassandraTable(keyspace, 'nasalogs', row_format=pyspark_cassandra.RowFormat.DICT) \
        .map(lambda line: (line['host'],(1, line['bytes'])))

    request2 = request.reduceByKey(add_tuples) \
        .map(lambda (host,(count, byte)): ("r",(1, count, byte, count*count, byte*byte, count*byte)))
    request3 = request2.reduceByKey(add_tuples)
    r = request3.map(lambda (key, sixsum): \
        (key, (sixsum[0]*sixsum[5] - sixsum[1]*sixsum[2])/(math.sqrt(sixsum[0]*sixsum[3]-sixsum[1]*sixsum[1]) \
        * math.sqrt(sixsum[0]*sixsum[4]-sixsum[2]*sixsum[2]))))
    r2 = r.map(lambda (key, value): (key + '2', value * value))

    #Save result
    outdata = r.union(r2)
    # Coalescing the result onto a single partition is safe here because the output is small.
    outdata.coalesce(1).saveAsTextFile(output)
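
The reduce steps above rely on an add_tuples helper that is not shown; it is what accumulates the per-host (count, bytes) pairs and the six sums (n, sum x, sum y, sum x^2, sum y^2, sum xy) used in the Pearson correlation computed for r. A minimal sketch, assuming plain element-wise addition:

# Assumed helper (not part of the original snippet): element-wise tuple addition,
# usable as the reduce function for tuples of any fixed length.
def add_tuples(a, b):
    return tuple(x + y for x, y in zip(a, b))
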
Example #2
def main():
    keyspace = sys.argv[1]
    output = sys.argv[2]
    orderkeys = sys.argv[3:]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders sql')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df_for_orders(keyspace, sc, sqlContext)
    df_for_lineitem(keyspace, sc, sqlContext)
    df_for_part(keyspace, sc, sqlContext)



    df_orders = sqlContext.sql("""SELECT o.orderkey, o.totalprice, p.name FROM
                                    orders o
                                    JOIN lineitem l ON (o.orderkey = l.orderkey)
                                    JOIN part p ON (l.partkey = p.partkey)
                                    WHERE o.orderkey in (""" + ",".join(orderkeys) \
                                +")")

    rdd_orders = df_orders.rdd
    rdd_orders = rdd_orders.map(map_key) \
               .reduceByKey(reduce_parts)\
               .sortByKey() \
               .map(map_output).coalesce(1)
    rdd_orders.saveAsTextFile(output)
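
The df_for_orders, df_for_lineitem and df_for_part helpers are defined elsewhere in the original project. A plausible sketch of df_for_orders, assuming each helper simply registers the corresponding Cassandra table as a temporary SQL table (the column list here is partial and purely illustrative; the other two helpers would be analogous):

# Hypothetical helper: load the Cassandra `orders` table and expose it to Spark SQL.
def df_for_orders(keyspace, sc, sqlContext):
    rdd = sc.cassandraTable(keyspace, 'orders',
                            row_format=pyspark_cassandra.RowFormat.DICT) \
            .map(lambda r: (r['orderkey'], r['custkey'], r['totalprice']))
    df = sqlContext.createDataFrame(rdd, ['orderkey', 'custkey', 'totalprice'])
    df.registerTempTable('orders')
    return df
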
Example #3
def main():
    keyspace = sys.argv[1]
    output = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    conf.setAppName('correlate logs better in cassandra')

    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    mapped_lines = rdd_for(keyspace, "nasalogs", sc, split_size=9000)

    mapped_tuples = mapped_lines.map(get_tuples)
    reduced_tuples = mapped_tuples.reduceByKey(add_tuples)

    reduced_tuples.cache()

    N = reduced_tuples.count()

    average_compute = reduced_tuples.map(lambda p: (p[1][0], p[1][1]))
    bytes_total, requests_total = average_compute.reduce(add_tuples)

    bytes_avg = 1.0 * bytes_total / N
    requests_avg = 1.0 * requests_total / N

    temp = reduced_tuples.map(lambda p: calculate(p, bytes_avg, requests_avg))
    numerator, denom_1, denom_2 = temp.reduce(add_tuples)
    denom_1 = math.sqrt(denom_1)
    denom_2 = math.sqrt(denom_2)
    denom = denom_1 * denom_2
    result = numerator / denom

    temp = ['r = %f' % (result), 'r^2 = %f' % (result**2)]
    sc.parallelize(temp, numSlices=1).saveAsTextFile(output)
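
rdd_for, get_tuples and calculate come from elsewhere in the original file (rdd_for is presumably a thin wrapper around sc.cassandraTable). Reconstructed from how their outputs are consumed above, plausible sketches are:

# Assumed helpers. get_tuples turns one log row into (host, (bytes, 1)), so the
# reduceByKey produces per-host (total_bytes, request_count) pairs; calculate
# then yields that host's contribution to the Pearson correlation terms.
def get_tuples(row):
    return (row['host'], (row['bytes'], 1))

def calculate(p, bytes_avg, requests_avg):
    host, (bytes_sum, request_count) = p
    dx = bytes_sum - bytes_avg
    dy = request_count - requests_avg
    return (dx * dy, dx * dx, dy * dy)
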
Example #4
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    inputs = argv[0]
    keyspace = argv[1]
    table = argv[2]

    # initialize spark cassandra
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    #Process input file based on table_name
    if table == "yelp_review":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("user_id","review_id","votes_cool","business_id","votes_funny", \
            "stars","date","votes_useful")

        rdd = df.rdd.map(lambda line: (line[0],line[1],int(line[2]), \
            line[3],int(line[4]),int(line[5]),line[6], int(line[7])))

        columns = ["user_id","review_id","votes_cool","business_id","votes_funny", \
            "stars","date","votes_useful"]

    elif table == "yelp_business":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("business_id","name","review_count","state","full_address",\
            "open","city","latitude","longitude","stars")
        rdd = df.rdd.map(lambda line: (line[0],line[1],int(line[2]),line[3], \
            line[4],line[5],line[6],float(line[7]),float(line[8]),float(line[9])))
        columns = ["business_id","name","review_count","state","full_address",\
            "open","city","latitude","longitude","stars"]

    elif table == "yelp_business_checkin":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("business_id","day","hour","checkin")
        rdd = df.rdd.map(lambda line: (str(uuid.uuid1()), line[0], int(line[1]),
            int(line[2]), int(float(line[3]))))
        columns = ["id", "business_id", "day", "hour", "checkin"]

    elif table == "yelp_user":
        df = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true').load(inputs) \
            .select("user_id","name","yelping_since")
        rdd = df.rdd.map(lambda line: (line[0], line[1], '12345678', line[2]))
        columns = ["user_id", "name", "password", "yelping_since"]
    else:
        raise ValueError('unexpected table name: %s' % table)

    #Save result to Cassandra
    rdd.saveToCassandra(keyspace,
                        table,
                        columns=columns,
                        batch_size=300,
                        parallelism_level=1000)
Example #5
def main():
        keyspace = sys.argv[1]
        output = sys.argv[2]
        orderkeys = sys.argv[3:]

        cluster_seeds = ['199.60.17.136', '199.60.17.173']
        conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
        conf.set('spark.dynamicAllocation.maxExecutors', 20)
        conf.setAppName('tpch orders denorm cassandra')
        sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
        sqlContext = SQLContext(sc)

        orders_rdd = rdd_for_orders(keyspace, sc, sqlContext, orderkeys)

        orders_rdd = orders_rdd.map(map_output)
        orders_rdd.saveAsTextFile(output)
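
rdd_for_orders and map_output are not shown. Since this job reads the denormalized orders_parts table written by Example #8, one plausible (naive) sketch is below; the exact report format and any server-side filtering on orderkey are assumptions:

# Hypothetical helpers for the denormalized read path.
def rdd_for_orders(keyspace, sc, sqlContext, orderkeys):
    keys = set(int(k) for k in orderkeys)
    return sc.cassandraTable(keyspace, 'orders_parts',
                             row_format=pyspark_cassandra.RowFormat.DICT) \
             .filter(lambda row: row['orderkey'] in keys)

def map_output(row):
    # assumed report line: "Order #<orderkey> $<totalprice>: <part names>"
    return 'Order #%d $%.2f: %s' % (row['orderkey'], row['totalprice'],
                                    ', '.join(sorted(row['part_names'])))
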
Example #6
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    inputs = argv[0]
    keyspace = argv[1]
    
    # initialize spark cassandra     
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    #Get RDD from input file
    text = sc.textFile(inputs)
    linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")    
    request = text.map(lambda line: linere.split(line)) \
        .filter(lambda lineSplit: len(lineSplit) > 4) \
        .map(lambda lineSplit: (str(uuid.uuid1()), lineSplit[1],datetime.datetime.strptime(lineSplit[2], '%d/%b/%Y:%H:%M:%S').isoformat(),\
            lineSplit[3], int(lineSplit[4])))
    
    #Save result to Cassandra
    request.saveToCassandra(keyspace, 'nasalogs', columns=["id", "host", "datetime", "path", "bytes"], batch_size=300, parallelism_level=1000 )          
Example #7
def main():
	#Defining input directory, keyspace and table name
	inputs = sys.argv[1]
	keyspace = sys.argv[2]

	#Cluster configuration
	cluster_seeds = ['199.60.17.136', '199.60.17.173']
	conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
	sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

	#Reading the input file, and then matching the pattern
	file_data = sc.textFile(inputs)
	linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")
	
	#Mapping the data after fetching the required values out of the Nasa Web server logs file
	KeyValue = file_data.map(lambda line : linere.split(line)).filter(lambda x : len(x)==6).map(lambda y : (y[1],(dt.datetime.strptime(y[2], '%d/%b/%Y:%H:%M:%S')),y[3],y[-2])).cache()
	
	#Mapping the KeyValue RDD as the required format of 4 columns
	Nasa = KeyValue.map(lambda p: {"host": p[0], "datetime": p[1], "path": p[2], "bytes": long(p[3])})
	
	Nasa.saveToCassandra(keyspace, 'nasalogs')
Example #8
def main():
    input_keyspace = sys.argv[1]
    output_keyspace = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders denormalize cassandra')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df_for_part(input_keyspace, sc, sqlContext)
    df_lineitems = df_for_lineitem(input_keyspace, sc, sqlContext)

    df_part = sqlContext.sql("""SELECT l.orderkey, p.name
                                  FROM lineitem l
                                  JOIN part p ON (l.partkey = p.partkey)""")

    df_part_names = rows_to_list(df_part, "orderkey", "name", "names",
                                 sqlContext)

    df_part_names.registerTempTable("part_names")

    df_orders = df_for_orders(input_keyspace, sc, sqlContext)

    df_orders = sqlContext.sql("""SELECT o.*, p.names as part_names
                                    FROM
                                    orders o
                                    JOIN part_names p ON (o.orderkey = p.orderkey) """
                               )

    rdd_orders = df_orders.rdd.map(map_primary_key)

    rdd_orders.saveToCassandra(output_keyspace,
                               'orders_parts',
                               parallelism_level=64)
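
rows_to_list(df, key_col, value_col, list_col, sqlContext) collapses the per-lineitem part names into one list per orderkey before the final join; map_primary_key presumably just reorders each row into the tuple layout of the orders_parts table. A minimal sketch of rows_to_list done at the RDD level (the original may use a SQL aggregate instead):

# Hypothetical helper: (orderkey, name) rows -> (orderkey, [names]) as a DataFrame.
def rows_to_list(df, key_col, value_col, list_col, sqlContext):
    grouped = df.rdd.map(lambda r: (r[key_col], r[value_col])) \
                    .groupByKey() \
                    .map(lambda kv: (kv[0], list(kv[1])))
    return sqlContext.createDataFrame(grouped, [key_col, list_col])
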
Example #9
def to_cassandra_data(line, types):
    # NOTE: the opening lines of this function were cut off in the original
    # example; TPC-H .tbl rows are '|'-delimited, so the split is an assumption.
    data = line.split('|')
    cdata = dict((f, v) for v, (f, t) in zip(data, types))
    return cdata


def read_table(keyspace, sc, input_dir, tbl, types):
    infile = os.path.join(input_dir, tbl + '.tbl.gz')
    cdata = sc.textFile(infile).repartition(100).map(
        lambda line: to_cassandra_data(line, types)).setName(tbl)
    cdata.saveToCassandra(keyspace,
                          tbl,
                          consistency_level=ConsistencyLevel.ONE)


def read_tables(sc, input_dir, keyspace):
    for tbl, types, primarykey in tables:
        read_table(keyspace, sc, input_dir, tbl, types)


if __name__ == '__main__':
    input_dir = sys.argv[1]
    keyspace = sys.argv[2]

    from pyspark import SparkConf
    import pyspark_cassandra

    # cluster_seeds is not defined in this snippet; presumably the same seed
    # nodes used by the other examples.
    cluster_seeds = ['199.60.17.136', '199.60.17.173']

    conf = SparkConf().setAppName('TPC ingest') \
        .set("spark.cassandra.connection.host", ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)

    read_tables(sc, input_dir, keyspace)
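
The module-level tables list is defined elsewhere in the original file. Based on how read_tables and to_cassandra_data unpack it, each entry must be (table_name, [(column, type), ...], primary_key_columns); a purely illustrative sketch with abbreviated TPC-H columns:

# Hypothetical sketch only: the real TPC-H tables have more columns than shown here.
tables = [
    ('part',   [('partkey', int), ('name', str), ('retailprice', float)], ['partkey']),
    ('orders', [('orderkey', int), ('custkey', int), ('totalprice', float)], ['orderkey']),
]
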
Example #10
def main():
    outdir = sys.argv[1]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds)).set(
                               'spark.dynamicAllocation.maxExecutors', 20)
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Loading yelp_review data
    yelp_review = sc.cassandraTable("bigbang", "yelp_review").select('user_id','business_id','stars')\
        .map(lambda r: (r['user_id'],r['business_id'],r['stars'])).cache()

    user_id_lst = list(set(yelp_review.map(lambda r: r[0]).collect()))
    user_dict = map_string_to_int(user_id_lst)

    business_id_lst = list(set(yelp_review.map(lambda r: r[1]).collect()))
    business_dict = map_string_to_int(business_id_lst)

    ratings = yelp_review.map(
        lambda r: (user_dict[r[0]], business_dict[r[1]], r[2])).cache()

    # Splitting Ratings data into Training set, Validation set and Test set.
    training_RDD, validation_RDD, test_RDD = ratings.randomSplit([6, 2, 2],
                                                                 seed=0L)
    validation_for_predict_RDD = validation_RDD.map(lambda x:
                                                    (x[0], x[1])).cache()
    test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1])).cache()

    # Training Phase
    seed = 5L
    iterations = [10, 15, 20]
    regularization_parameters = [1.0, 0.1, 0.01]
    ranks = [12, 16, 20]
    min_error = float('inf')
    best_rank = -1
    for rank in ranks:
        for iteration in iterations:
            for regularization_parameter in regularization_parameters:
                model = ALS.train(training_RDD,
                                  rank,
                                  iteration,
                                  lambda_=regularization_parameter,
                                  seed=seed)
                predictions = model.predictAll(validation_for_predict_RDD).map(
                    lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = validation_RDD.map(
                    lambda r: ((r[0], r[1]), r[2])).join(predictions)
                error = math.sqrt(
                    rates_and_preds.map(lambda r:
                                        (r[1][0] - r[1][1])**2).mean())
                print 'For rank %s , iteration %s , regularization parameter %s the RMSE is %s' % (
                    rank, iteration, regularization_parameter, error)
                if error < min_error:
                    min_error = error
                    best_rank = rank
                    best_iteration = iteration
                    best_regularizer = regularization_parameter

    print 'The best model was trained with rank %s , iteration %s , regularization parameter %s' % (
        best_rank, best_iteration, best_regularizer)

    # Test the selected model
    model = ALS.train(training_RDD,
                      best_rank,
                      best_iteration,
                      lambda_=best_regularizer,
                      seed=seed)
    predictions = model.predictAll(test_for_predict_RDD).map(
        lambda r: ((r[0], r[1]), r[2]))

    rates_and_preds = test_RDD.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    print 'For testing data the RMSE is %s' % (error)

    result = sc.parallelize([('best_rank', best_rank),
                             ('best_iteration', best_iteration),
                             ('best_regularizer', best_regularizer),
                             ('train_error', min_error),
                             ('test_error', error)])
    outdata = result.map(lambda o_str: u"%s : %s" %
                         (o_str[0], o_str[1])).coalesce(1)
    outdata.saveAsTextFile(outdir)
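
map_string_to_int is not shown (it is shared with Example #12). MLlib's ALS needs integer user and product ids, so a minimal sketch is:

# Assumed helper: give each distinct string id a small integer id.
def map_string_to_int(id_list):
    return dict((s, i) for i, s in enumerate(id_list))
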
Example #11
def clean_str(str):
    import re
    str = str + " "
    str = re.sub("http[^ ]*[\\\]", "\\\\", str)  #Remove hyperlinks
    str = re.sub("http[^ ]* ", " ", str)  #Remove hyperlinks
    str = str.replace('\\n', ' ')
    arr = re.findall(
        r"\w+(?:[-']\w+)*|'|[:)-.(]+|\S\w*",
        str)  #Single punctuation mark is removed, smileys remain intact
    arr = [i for i in arr if len(i) > 1 and i[0] != '@'
           ]  #Remove words starting with @ (Twitter mentions)
    arr = [i if i[0] != '#' else i[1:] for i in arr]  #Remove '#' from hashtags
    #arr=[i for i in arr if i!='http' and i!='com' and i!='org']
    res = " ".join(arr)
    return res.lower().strip()


fp, out = sys.argv[1].split(',')

sc = pyspark_cassandra.CassandraSparkContext()

data = sc.textFile(fp, 36)

clean_text = data.map(json.loads) \
                 .map(lambda x: (x, clean_str(x['text'])))

json_preds = clean_text.map(lambda x: (x[0], predictTweet(x[1])['pos'])) \
                       .map(json.dumps)

json_preds.saveAsTextFile(out)
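
predictTweet is the project's trained sentiment classifier and is not shown; it evidently returns a dict with at least a 'pos' score for the cleaned tweet text. A purely hypothetical stand-in with the same interface, useful only for wiring up the pipeline:

# Hypothetical stand-in for the real classifier: crude keyword ratio, same return shape.
def predictTweet(text):
    positive = set(['good', 'great', 'love', 'awesome', 'nice'])
    words = text.split()
    pos = sum(1.0 for w in words if w in positive) / max(len(words), 1)
    return {'pos': pos, 'neg': 1.0 - pos}
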
Example #12
def main():
    in_path = sys.argv[1]
    #outdir = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Loading best parameters found in model_selection.py
    text = sc.textFile(in_path)
    text_processed = text.map(lambda line: line.split(":")).cache()
    best_rank = text_processed.filter(lambda w: str(w[0]).strip() == 'best_rank') \
        .map(lambda w: w[1]).collect()
    best_rank_int = int(best_rank[0])
    best_iteration = text_processed.filter(lambda w: str(w[0]).strip() == 'best_iteration')\
        .map(lambda w: w[1]).collect()
    best_iter_int = int(best_iteration[0])
    best_regularizer = text_processed.filter(lambda w: str(w[0]).strip() == 'best_regularizer')\
        .map(lambda w: w[1]).collect()
    best_reg_float = float(best_regularizer[0])

    # Loading yelp_business data
    yelp_business = sc.cassandraTable("bigbang", "yelp_business").\
        select('business_id', 'city', 'full_address', 'latitude', 'longitude', 'name', 'open', 'review_count', 'stars', 'state')\
        .cache()

    # Loading yelp_review data
    yelp_review = sc.cassandraTable("bigbang", "yelp_review").select('user_id','business_id','stars')\
        .map(lambda r: (r['user_id'],r['business_id'],r['stars'])).cache()

    # mapping string user_ids to int values
    user_id_lst = list(set(yelp_review.map(lambda r: r[0]).collect()))
    user_dict = map_string_to_int(user_id_lst)

    # mapping string business_ids to int values
    business_id_lst = list(set(yelp_review.map(lambda r: r[1]).collect()))
    business_dict = map_string_to_int(business_id_lst)

    #Converting the string user_id and business_id in ratings/review data into integer values using dictionaries
    ratings = yelp_review.map(
        lambda r: (user_dict[r[0]], business_dict[r[1]], r[2])).cache()

    # Train the model with the best parameters found in model_selection.py
    seed = 5L
    model = ALS.train(ratings,
                      best_rank_int,
                      best_iter_int,
                      lambda_=best_reg_float,
                      seed=seed)

    #Preparing distinct list of user_ids for predicting ratings for.
    ratings_df = ratings.toDF()
    ratings_df.registerTempTable('ratingsTable')
    user_distinct_df = sqlContext.sql(
        """SELECT distinct(_1) as user_id FROM ratingsTable LIMIT 10""")
    user_distinct_df.registerTempTable('userDistinctTable')

    # Preparing distinct list of business_ids for predicting ratings for.
    business_distinct_df = sqlContext.sql(
        """SELECT distinct(_2) as business_id FROM ratingsTable""")
    business_distinct_df.registerTempTable('businessDistinctTable')

    # Joining list of user_ids and business_ids.
    user_movie_for_pred_df = sqlContext.sql(
        """SELECT user_id, business_id from userDistinctTable , businessDistinctTable"""
    )
    user_movie_for_pred_rdd = user_movie_for_pred_df.rdd.map(
        lambda p: (int(p[0]), int(p[1])))

    #Predicting ratings for the user_id and business_id combination retrieved above.
    predictions = model.predictAll(user_movie_for_pred_rdd).map(
        lambda r: (r[0], r[1], r[2])).cache()

    # mapping int user_ids back to their string values
    user_dict = map_int_to_string(user_dict)

    # mapping int business_ids back to their string values
    business_dict = map_int_to_string(business_dict)

    #Saving output predicted ratings to cassandra table 'user_pred_ratings'
    preds = predictions.map(lambda p:
                            (str(user_dict[p[0]]), str(business_dict[p[1]]),
                             Decimal(p[2]).quantize(decimal.Decimal('1.00'))))
    print(preds.take(5))
    #preds.saveAsTextFile(outdir)
    preds.saveToCassandra(
        'bigbang',
        'user_pred_ratings',
        columns=["user_id", "business_id", "user_pred_rating"],
        parallelism_level=1000)
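
map_int_to_string inverts the dictionaries produced by map_string_to_int so the predictions can be keyed by the original Yelp ids again; a minimal sketch:

# Assumed helper: invert {string_id: int_id} into {int_id: string_id}.
def map_int_to_string(str_to_int):
    return dict((i, s) for s, i in str_to_int.items())
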
Example #13
def main():
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # Loading yelp_business data
    yelp_business = sc.cassandraTable("bigbang", "yelp_business").\
        select('business_id', 'city', 'full_address', 'latitude', 'longitude', 'name', 'open', 'review_count', 'stars', 'state')\
        .toDF()
    yelp_business.registerTempTable('business')

    # Loading yelp_user data
    yelp_user = sc.cassandraTable("bigbang",
                                  "yelp_user").select('user_id', 'name',
                                                      'yelping_since').toDF()
    yelp_user.registerTempTable('user')

    # Loading user_pred_ratings data
    user_pred_ratings = sc.cassandraTable("bigbang", "user_pred_ratings").\
       select('user_id', 'business_id', 'user_pred_rating').toDF()
    user_pred_ratings.registerTempTable('userPredRatings')

    # Loading checkin data
    business_checkin = sc.cassandraTable("bigbang", "yelp_business_checkin")\
        .select('business_id', 'checkin','day','hour').toDF()
    business_checkin.registerTempTable('businessCheckin')

    # Joining user, business and ratings to get denormalized data
    user_res_pred_denorm_df = sqlContext.sql(
        """SELECT a.user_id as user_id, b.city as city, a.business_id as business_id, b.full_address as full_address,
       b.latitude as latitude, b.longitude as longitude, b.name as business_name, b.open as open,
       b.review_count as review_count, b.stars as stars, b.state as state, c.name as user_name,
       a.user_pred_rating as user_pred_rating
       from userPredRatings a inner join business b
       on a.business_id = b.business_id
       inner join user c
       on a.user_id = c.user_id""")

    # user_res_pred_denorm_df.registerTempTable('userResPredDenorm')
    #
    # recommendations_top10_df = sqlContext.sql(
    #    """SELECT user_id, city, business_id, full_address, latitude, longitude, business_name, open, review_count,
    #    stars, state, user_name, user_pred_rating FROM
    #     (SELECT user_id, city, business_id, full_address, latitude, longitude, business_name, open, review_count, stars,
    #     state, user_name, user_pred_rating, row_number() OVER (PARTITION BY user_id, state ORDER BY user_pred_rating desc) rn
    #     FROM userResPredDenorm) temp
    #     WHERE temp.rn <= 20""")

    #Retrieving top 20 restaurants per user per state using Window Analytic Functions
    windowSpec = Window.partitionBy(user_res_pred_denorm_df['user_id'],user_res_pred_denorm_df['state']).\
        orderBy(user_res_pred_denorm_df['user_pred_rating'].desc())

    rn_business = (func.rowNumber().over(windowSpec))
    recommendations_ranked_df = user_res_pred_denorm_df.select(
        user_res_pred_denorm_df['user_id'], user_res_pred_denorm_df['city'],
        user_res_pred_denorm_df['business_id'],
        user_res_pred_denorm_df['full_address'],
        user_res_pred_denorm_df['latitude'],
        user_res_pred_denorm_df['longitude'],
        user_res_pred_denorm_df['business_name'],
        user_res_pred_denorm_df['open'],
        user_res_pred_denorm_df['review_count'],
        user_res_pred_denorm_df['stars'], user_res_pred_denorm_df['state'],
        user_res_pred_denorm_df['user_name'],
        user_res_pred_denorm_df['user_pred_rating'], rn_business.alias("rn"))

    recommendations_top20_df = recommendations_ranked_df.filter(
        recommendations_ranked_df["rn"] <= 20).select(
            recommendations_ranked_df['user_id'],
            recommendations_ranked_df['city'],
            recommendations_ranked_df['business_id'],
            recommendations_ranked_df['full_address'],
            recommendations_ranked_df['latitude'],
            recommendations_ranked_df['longitude'],
            recommendations_ranked_df['business_name'],
            recommendations_ranked_df['open'],
            recommendations_ranked_df['review_count'],
            recommendations_ranked_df['stars'],
            recommendations_ranked_df['state'],
            recommendations_ranked_df['user_name'],
            recommendations_ranked_df['user_pred_rating'])
    recommendations_top20_df.registerTempTable('recommendations')

    recommendations_top20_rdd = recommendations_top20_df.rdd.map(lambda p: (
        str(uuid.uuid1()), p['user_id'], p['city'], p['business_id'],
        p['full_address'], p['latitude'], p['longitude'], p['business_name'],
        p['open'], p['review_count'], p['stars'], p['state'], p['user_name'],
        p['user_pred_rating']))
    # Loading the recommendations table in cassandra
    recommendations_top20_rdd.saveToCassandra(
        'bigbang',
        'recommendations',
        columns=[
            "uid", "user_id", "business_city", "business_id",
            "business_full_address", "business_latitude", "business_longitude",
            "business_name", "business_open", "business_review_count",
            "business_stars", "business_state", "user_name", "user_pred_rating"
        ],
        parallelism_level=1000)

    # Joining recommendations and checkin data
    user_res_checkin_denorm_df = sqlContext.sql(
        """SELECT a.user_id as user_id, a.city as business_city, a.business_id as business_id,
       a.full_address as business_full_address, a.latitude as business_latitude,
       a.longitude as business_longitude, a.business_name as business_name, a.open as business_open,
       a.review_count as business_review_count, a.stars as business_stars,
       a.state as business_state, a.user_name as user_name,
       a.user_pred_rating as user_pred_rating, b.day as day, b.hour as hour, b.checkin as checkin
       from recommendations a inner join businessCheckin b
       on a.business_id = b.business_id""")

    recommendations_checkin_rdd = user_res_checkin_denorm_df.rdd.map(lambda p: (
        str(uuid.uuid1()), p['user_id'], p['business_city'], p['business_id'],
        p['business_full_address'], p['business_latitude'],
        p['business_longitude'], p['business_name'], p['business_open'],
        p['business_review_count'], p['business_stars'], p['business_state'],
        p['user_name'], p['user_pred_rating'], p['day'], p['hour'],
        p['checkin']))
    # Loading the recommendations_checkin table in cassandra
    recommendations_checkin_rdd.saveToCassandra(
        'bigbang',
        'recommendations_checkin',
        columns=[
            "uid", "user_id", "business_city", "business_id",
            "business_full_address", "business_latitude", "business_longitude",
            "business_name", "business_open", "business_review_count",
            "business_stars", "business_state", "user_name",
            "user_pred_rating", "day", "hour", "checkin"
        ],
        parallelism_level=1000)
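
The imports for Example #13 are not shown; given the APIs it calls (uuid primary keys, HiveContext, the Spark 1.x window functions and pyspark_cassandra), it presumably needs something like:

# Presumed imports for this example (Spark 1.x-era API).
import uuid

import pyspark_cassandra
from pyspark import SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import functions as func
from pyspark.sql.window import Window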