Example #1
    def main(self, sc, *args):
        sql_context = SQLContext(sc)
        df = self.read_from_hdfs(sql_context)
        df = self.count_occurrence(df)
        self.write_to_hdfs(df)
Example #2
import pickle

import findspark
findspark.init()  # make the local Spark installation importable

from dataMining.checkDB import checkStatus
from dataMining.databaseInit import initializeDatabase
from dataMining.mongoQuery import askMongo
from flask_uploads import UploadSet, configure_uploads
from flask import Flask, render_template, request, send_file
from pyspark import SparkContext, SQLContext
from pyspark.ml.recommendation import ALSModel

app = Flask(__name__)
app.config.from_pyfile('config.py')
sc = SparkContext(appName="Yelp")
sc.setLogLevel("ERROR")

sqlc = SQLContext(sc)

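# Load pre-computed Yelp data for Charlotte restaurants (reviews, businesses, users) from pickle files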
with open('../data/Charlotte_Restaurants_review.pickle', 'rb') as f:
    all_visited = pickle.load(f)

with open('../data/Charlotte_Restaurants_business.pickle', 'rb') as f:
    rest = pickle.load(f)

with open('../data/Charlotte_Restaurants_user.pickle', 'rb') as f:
    user = pickle.load(f)

n_business = len(rest)

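# Load the previously trained ALS recommendation model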
best_model = ALSModel.load('../data/Charlotte_als_model')

def main_acr_preprocess():
    # def main():

    parameter = load_json_config("./parameter.json")
    list_args = parameter["acr_preprocess"]
    DATA_DIR = parameter["DATA_DIR"]
    path_pickle = DATA_DIR + list_args["path_pickle"]
    path_tf_record = DATA_DIR + list_args["path_tf_record"]
    input_word_embeddings_path = DATA_DIR + list_args[
        "input_word_embeddings_path"]
    vocab_most_freq_words = list_args["vocab_most_freq_words"]
    max_words_length = list_args["max_words_length"]
    output_word_vocab_embeddings_path = DATA_DIR + list_args[
        "output_word_vocab_embeddings_path"]
    output_label_encoders = DATA_DIR + list_args["output_label_encoders"]
    output_tf_records_path = DATA_DIR + list_args["output_tf_records_path"]
    output_articles_csv_path_preprocessed = DATA_DIR + list_args[
        "output_articles_csv_path_preprocessed"]
    output_articles_csv_path_original = DATA_DIR + list_args[
        "output_articles_csv_path_original"]
    articles_by_tfrecord = list_args["articles_by_tfrecord"]
    mysql_host = list_args["mysql_host"]
    mysql_user = list_args["mysql_user"]
    mysql_passwd = list_args["mysql_passwd"]
    mysql_database = list_args["mysql_database"]
    mysql_table = list_args["mysql_table"]
    domain = list_args["domain"]

    print("<=== STARTING ARC PREPROCESS ===>")
    spark = spark_inital()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    # ACR PREPROCESS
    if not path.exists(output_label_encoders):
        os.makedirs(output_label_encoders)
        os.makedirs(output_word_vocab_embeddings_path)
        os.makedirs(output_articles_csv_path_preprocessed)
        os.makedirs(output_articles_csv_path_original)

    if not path.exists(path_tf_record):
        os.makedirs(path_tf_record)

    list_args_2 = parameter["acr_training"]
    acr_path = DATA_DIR + list_args_2["output_acr_metadata_embeddings_path"]
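    # Load previously serialized ACR artifacts: label encoders, article metadata and content embeddings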
    acr_label_encoders, articles_metadata_df, content_article_embeddings = deserialize(
        get_all_file(acr_path)[0])
    isEmpty = 0
    # DATABASE NEWS FROM MYSQL
    news_df_from_mysql = handle_database_news(domain, mysql_host, mysql_user,
                                              mysql_passwd, mysql_database,
                                              mysql_table,
                                              list_args['date_start'],
                                              list_args["date_end"])

    # filter
    list_cul = [
        'catId', 'content', 'email', 'newsId', 'publishDate', 'sapo',
        'sourceNews', 'tags', 'title', 'url'
    ]
    news_df_from_mysql = news_df_from_mysql[news_df_from_mysql["sourceNews"] ==
                                            "CafeBiz"]
    news_df_from_mysql = news_df_from_mysql.dropna()

    # remove deleted news
    news_df_from_mysql = news_df_from_mysql[news_df_from_mysql["is_deleted"] ==
                                            0]

    news_df_from_mysql = news_df_from_mysql[list_cul]

    if news_df_from_mysql.empty:  # if empty
        isEmpty = 1
        return isEmpty

    news_df_handle_by_spark = handle_database_news_by_spark(
        news_df_from_mysql, spark)

    news_df = remove_duplicate_newsid(news_df_handle_by_spark,
                                      acr_label_encoders)

    if news_df.empty:  # if empty
        isEmpty = 1
        return isEmpty

    # fill NaN entity values (person, location) with 0
    # news_df.fillna("0", inplace=True)
    news_df = custome_df(news_df)
    print("Saving news articles csv original CSV to ")
    write_articale_csv_original(news_df, output_articles_csv_path_original)

    if len(os.listdir(path_pickle)) == 0:  # empty first time
        print("File Chua Da Ton Tai")
        print('Encoding categorical features')
        cat_features_encoders, labels_class_weights = process_cat_features(
            news_df)

        # write_dict_newsid_encode(cat_features_encoders["article_id"])

        print('Exporting LabelEncoders of categorical features: {}'.format(
            output_label_encoders))
        save_article_cat_encoders(
            output_label_encoders + "acr_label_encoders.pickle",
            cat_features_encoders, labels_class_weights)

        print("Saving news articles CSV to {}".format(
            output_articles_csv_path_preprocessed))
        # news_df.to_csv( output_articles_csv_path_preprocessed +"cafebiz_articles.csv", index=False)
        news_df.to_csv(output_articles_csv_path_preprocessed +
                       "cafebiz_articles.csv",
                       index=False)

        print('Tokenizing articles...')
        tokenized_articles = tokenize_articles(
            news_df['text_highlights'].values,
            tokenization_fn=get_tkn_fn(max_words_length))

        print('Computing word frequencies...')
        words_freq = get_words_freq(tokenized_articles)

        print(
            "Loading word2vec model and extracting words of this corpus' vocabulary..."
        )
        w2v_model = Singleton.getInstance(input_word_embeddings_path)
        word_vocab, word_embeddings_matrix = process_word_embedding_for_corpus_vocab(
            w2v_model, words_freq, vocab_most_freq_words)

        print('Saving word embeddings and vocab.: {}'.format(
            output_word_vocab_embeddings_path))
        save_word_vocab_embeddings(
            output_word_vocab_embeddings_path +
            "acr_word_vocab_embeddings.pickle", word_vocab,
            word_embeddings_matrix)

        print('Converting tokens to int numbers (according to the vocab.)...')
        texts_int, texts_lengths = convert_tokens_to_int(
            tokenized_articles, word_vocab)
        news_df['text_length'] = texts_lengths
        news_df['text_int'] = texts_int

        data_to_export_df = news_df[[
            'id',
            'url',  # For debug
            'id_encoded',
            'category0_encoded',
            # 'category1_encoded',
            'keywords_encoded',
            # 'author_encoded',
            # 'concepts_encoded',
            # 'entities_encoded',
            'locations_encoded',
            'persons_encoded',
            'created_at_ts',
            'text_length',
            'text_int'
        ]]

        print("Category 0:", news_df["category0_encoded"].unique())
        for k, v in labels_class_weights.items():
            print("Label class weight shape:", k, ":", v.shape)
        print('Exporting tokenized articles to TFRecords: {}'.format(
            path_tf_record))
        export_dataframe_to_tf_records(data_to_export_df,
                                       make_sequence_example,
                                       output_path=output_tf_records_path,
                                       examples_by_file=articles_by_tfrecord)

    else:  # not empty: this has run at least once before
        print("File already exists")
        print("Database has new articles")
        print("Calling singleton ACR content:")

        #Load ACR content
        # #1
        # from pick_singleton.pick_singleton import ACR_Pickle_Singleton
        # acr_content = ACR_Pickle_Singleton.getInstance()
        # acr_label_encoders = acr_content.acr_label_encoders

        #2

        # dict_news_id_encode = load_dict_news_id_encode()
        # word_vocab, word_embeddings_matrix = load_acr_preprocessing_word_embedding(get_file_max_date(get_all_file( output_word_vocab_embeddings_path)))

        print('Encoding categorical features')
        cat_features_encoders, labels_class_weights = process_cat_features_second_time(
            news_df, acr_label_encoders)

        # append_dict_newsid_encode(cat_features_encoders["article_id"])
        # write_dict_newsid_encode(cat_features_encoders["article_id"])

        # print('Exporting LabelEncoders of categorical features: {}'.format( output_label_encoders))
        # save_article_cat_encoders( output_label_encoders +"acr_label_encoders.pickle", cat_features_encoders,
        #                               labels_class_weights)

        print("Saving news articles CSV to {}".format(
            output_articles_csv_path_preprocessed))
        path_csv = get_all_file(output_articles_csv_path_preprocessed)[0]
        df = pd.read_csv(path_csv)
        frames = [df, news_df]

        result = pd.concat(frames, ignore_index=True)
        result.to_csv(output_articles_csv_path_preprocessed +
                      "cafebiz_articles.csv",
                      index=False)

        print('Tokenizing articles...')
        tokenized_articles = tokenize_articles(
            news_df['text_highlights'].values,
            tokenization_fn=get_tkn_fn(max_words_length))

        print('Computing word frequencies...')
        words_freq = get_words_freq(tokenized_articles)

        print(
            "Loading word2vec model and extracting words of this corpus' vocabulary..."
        )
        w2v_model = Singleton.getInstance(input_word_embeddings_path)
        word_vocab, word_embeddings_matrix = process_word_embedding_for_corpus_vocab(
            w2v_model, words_freq, vocab_most_freq_words)

        print('Saving word embeddings and vocab.: {}'.format(
            output_word_vocab_embeddings_path))
        # save_word_vocab_embeddings( output_word_vocab_embeddings_path +"acr_word_vocab_embeddings.pickle",
        #                                word_vocab, word_embeddings_matrix)

        print('Converting tokens to int numbers (according to the vocab.)...')
        texts_int, texts_lengths = convert_tokens_to_int_second_time(
            tokenized_articles, word_vocab)
        news_df['text_length'] = texts_lengths
        news_df['text_int'] = texts_int

        data_to_export_df = news_df[[
            'id',
            'url',  # For debug
            'id_encoded',
            'category0_encoded',
            # 'category1_encoded',
            'keywords_encoded',
            # 'author_encoded',
            # 'concepts_encoded',
            # 'entities_encoded',
            'locations_encoded',
            'persons_encoded',
            'created_at_ts',
            'text_length',
            'text_int'
        ]]

        print("Category 0:", news_df["category0_encoded"].unique())
        for k, v in labels_class_weights.items():
            print("Label class weight shape:", k, ":", v.shape)
        print('Exporting tokenized articles to TFRecords: {}'.format(
            path_tf_record))
        print("len data_to_export_df : {}".format(len(data_to_export_df)))
        export_dataframe_to_tf_records(data_to_export_df,
                                       make_sequence_example,
                                       output_path=output_tf_records_path,
                                       examples_by_file=articles_by_tfrecord)

    print("<=== END ARC PREPROCESS ===>")
    return isEmpty
def main():
    """Main function"""

    # Get args
    args = get_args()

    # Azure credentials
    sas_token = args.sas
    storage_account_name = args.storage
    container_in = args.container_in
    container_out = args.container_out

    azure_accounts = list()
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_in
    })
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_out
    })

    # VM
    cores = args.vm_cores
    ram = args.vm_ram
    shuffle_partitions = args.shuffle_partitions

    # Date, country, prefix
    country = args.country
    date_string = args.date

    # Set date variables
    day_time = datetime.strptime(date_string, "%Y-%m-%d")
    year = day_time.year
    month = day_time.month
    day = day_time.day
    dayofyear = day_time.timetuple().tm_yday

    # config
    accuracy = args.accuracy

    # Path in - path out
    blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/preprocessed/{country}/"
    path_out = f"users_bin_hours3-v{VERSION}/{country}"

    # config spark
    conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts)

    # set prop for handling partition columns as strings (fixes prefixes as int)
    conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")

    # Create spark session
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession

    # Init azure client
    blob_service_client = BlobServiceClient.from_connection_string(
        CONN_STRING.format(storage_account_name, sas_token))

    #  build keys, date is mandatory
    partition_key = "year={}/month={}/day={}".format(year, month, day)
    out_key = "year={}/dayofyear={}".format(year, dayofyear)
    blob_base = "{}/{}".format(path_out, out_key)

    # process
    print("process " + partition_key + " to " + blob_base)
    start_time = time.time()
    local_dir = LOCAL_PATH + out_key
    print("write temp to " + local_dir)

    # cleanup local if exists
    if os.path.isdir(local_dir):
        # note: map() is lazy in Python 3, so the original map(os.unlink, ...) never ran
        for f in os.listdir(local_dir):
            os.unlink(os.path.join(local_dir, f))

    # TODO cleanup remote if exists

    # Input dataset
    print("read dataset table")
    read_time = time.time()

    # dfs = spark.read.format("parquet").load(blob_in)

    # # apply partition filter
    # dfs_partition = dfs.where(
    #     f"(year = {year} AND month = {month} AND day = {day}  AND prefix = '{prefix}')")

    # read only partition to reduce browse time
    dfs_cur_partition = spark.read.format("parquet").load(
        f"{blob_in}/{partition_key}")

    spark_time = time.time()

    dfs_cur_partition = dfs_cur_partition.where((col('accuracy') <= accuracy)
                                                & (col('accuracy') >= 0))
    dfs_cur_partition = dfs_cur_partition.withColumn('hour',
                                                     F.hour('timestamp'))

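    # count the number of distinct hours in which each user appears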
    result_df = dfs_cur_partition.groupBy('userId').agg(
        F.countDistinct("hour").cast('int').alias('num_hours'))

    # rebuild prefix
    result_df = result_df.withColumn('prefix', result_df.userId.substr(1, 2))
    # lit partition columns in output
    result_df = result_df.withColumn('year', F.lit(year))
    #result_df = result_df.withColumn('month', F.lit(month))
    #result_df = result_df.withColumn('day', F.lit(day))
    #result_df = result_df.withColumn('dayofyear', F.lit(dayofyear))

    # write as single partition
    result_df.repartition(1).write.partitionBy("prefix").format(
        'parquet').mode("overwrite").save(local_dir + "/")

    # stats - enable only for debug!
    # num_records = result_df.count()
    # print(f"written {num_records} rows to "+local_dir)

    # if num_records == 0:
    #     raise Exception("Zero rows output")

    print("upload local data to azure")
    upload_time = time.time()

    # upload parts over states
    for fprefix in enumerate_prefixes():
        print(f"upload files for {fprefix}")
        prefix_dir = local_dir + "/prefix=" + fprefix
        prefix_key = f"{out_key}/prefix={fprefix}/"
        prefix_blob = f"{blob_base}/prefix={fprefix}"

        if (os.path.isdir(prefix_dir)):
            files = [
                filename for filename in os.listdir(prefix_dir)
                if filename.startswith("part-")
            ]

            if len(files) > 0:

                for file_local in files:
                    file_path = prefix_dir + "/" + file_local
                    part_num = int(file_local.split('-')[1])
                    part_key = '{:05d}'.format(part_num)
                    # fix name as static hash to be reproducible
                    filename_hash = hashlib.sha1(
                        str.encode(prefix_key + part_key)).hexdigest()

                    blob_key = "{}/part-{}-{}.snappy.parquet".format(
                        prefix_blob, part_key, filename_hash)

                    print("upload " + file_path + " to " + container_out +
                          ":" + blob_key)

                    blob_client = blob_service_client.get_blob_client(
                        container_out, blob_key)

                    with open(file_path, "rb") as data:
                        blob_client.upload_blob(data, overwrite=True)

                    # cleanup
                    os.remove(file_path)
            else:
                print(f"no files to upload for {fprefix}")

        else:
            print(f"missing partition for {fprefix}")

    print("--- {} seconds elapsed ---".format(int(time.time() - start_time)))
    print()
    shutdown_time = time.time()
    spark.stop()

    end_time = time.time()
    print("Done in {} seconds (read:{} spark:{} upload:{} shutdown:{})".format(
        int(end_time - start_time), int(spark_time - read_time),
        int(upload_time - spark_time), int(shutdown_time - upload_time),
        int(end_time - shutdown_time)))
    print('Done.')
Example #5
conf.set("spark.executor.memory", "4g")
# conf.set("spark.sql.shuffle.partitions", "400")
# conf.set("spark.yarn.executor.memoryOverhead", "256m")
conf.set("spark.network.timeout", "2000")
conf.set("spark.sql.broadcastTimeout", "300000")
# conf.set("spark.dynamicAllocation.enabled","true")
# conf.set("spark.shuffle.service.enabled", "true")
# conf.set("spark.local.dir", "/yelp-dataset/spark-tmp")
conf.set("spark.driver.memory", "1g")
# conf.set("spark.driver.maxResultSize","10g")
# sc = SparkContext("local[*]", "Simple App", conf=conf)
# sc.setCheckpointDir('/tmp')
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.6'
conf.setMaster(SPARK_URL)
sc = SparkContext(conf=conf)
sql_sc = SQLContext(sc)

# In[5]:

# users.persist()

# In[6]:

# + cat_features + geo_features

#     train_df = train_df.select(["user_id", "user_id_2"] + feature_columns + other_columns)
#     assembler = VectorAssembler(inputCols=feature_columns, outputCol="input_features")
#     features_df = assembler.transform(train_df)

#     scaler = StandardScaler(inputCol="input_features", outputCol="scaled_features",
#                             withStd=True, withMean=True)
Example #6
    rdd_parser = sub_parsers.add_parser('rdd')
    rdd_parser.set_defaults(func=load_to_rdd)
    rdd_parser.add_argument(
        '--num',
        dest='num',
        type=int,
        default=5,
        help='Number of values to print',
    )

    rdd_parser.add_argument('--skip-header',
                            dest='skipheader',
                            choices=['Y', 'N'],
                            default='Y')

    df_parser = sub_parsers.add_parser('df')
    df_parser.set_defaults(func=load_to_df)

    return parser


parser = config_parser()

if __name__ == '__main__':
    args = parser.parse_args()

    with SparkContext(appName='Load Users From CSV') as sc:
        sqc = SQLContext(sc)
        args.func(args, sc, sqc)
from pyspark import SparkContext, SQLContext
from pyspark.sql import functions as F

spark = SparkContext("local[*]", "SQL_Example")
sc = SQLContext(spark)

df = sc.read.load("hdfs://my-hadoop-cluster-hadoop-hdfs-nn:9000/data/dataset.csv",
                  format="csv",
                  sep=",",
                  inferSchema="true",
                  header="true")

sum = df.groupBy('artist').agg(
    F.sum('timesListened').alias('timesListened')) \
    .orderBy('timesListened', ascending=True)

sum.show()

sum.write.option("truncate", "true").format("jdbc") \
    .option("url", "jdbc:mysql://my-database-service:3306/spotifydb") \
    .option("dbtable", "live_spotify") \
    .option("user", "root") \
    .option("password", "mysecretpw") \
    .mode("overwrite") \
    .save()
Example #8
        raise


if __name__ == "__main__":
    # execute only if run as a script

    try:
        SUBMIT_ARGS = '''
                    --master local[*]
                    --driver-memory 2g
                    --packages mysql:mysql-connector-java:5.1.46 pyspark-shell
        '''
        os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
        conf = SparkConf()
        sc = SparkContext(conf=conf)
        sql_c = SQLContext(sc)

        sc.setLogLevel("WARN")
        spark = sql_c.sparkSession
        logging.info("Spark Application is up")
    except Exception as e:
        logging.error(str(e))
        raise

    #FIRST AIRFLOW STAGE
    try:  # In production these files would be moved with the AWS Boto3 library, which can move data between S3 buckets
        mkdir_p(os.getcwd() +
                "/the-movies-dataset-internal/movies_metadata/year=" +
                str(datetime.date.today().year) + "/month=" +
                str(datetime.date.today().month))
Example #9
from pyspark import SparkContext, SparkConf, SQLContext
import sys

if __name__ == '__main__':

    conf = SparkConf().setAppName("Natalia Martir - Practica 4")
    Sp = SparkContext(conf=conf)

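    # read the dataset header, keep the "@inputs" line and turn it into a list of column names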
    headers = Sp.textFile(
        "/user/datasets/ecbdl14/ECBDL14_IR2.header").collect()
    headers = list(filter(lambda x: "@inputs" in x, headers))[0]
    headers = headers.replace(",", "").strip().split()
    del headers[0]
    headers.append("class")

    sqlc = SQLContext(Sp)
    df = sqlc.read.csv('/user/datasets/ecbdl14/ECBDL14_IR2.data',
                       header=False,
                       inferSchema=True)

    for i, colname in enumerate(df.columns):
        df = df.withColumnRenamed(colname, headers[i])

    df = df.select("PSSM_r1_0_A", "PSSM_r2_-1_S", "PSSM_central_-1_G",
                   "PSSM_r2_-1_Q", "PSSM_r1_3_E", "PSSM_r1_-1_E", "class")
    df.write.csv('./filteredC.small.training', header=True)
import os

from gensim.models import Word2Vec

# configure spark to be started with more allocated memory
memory = '12g'
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

from pyspark import SparkConf, SparkContext, SQLContext

conf = (SparkConf().setMaster("local").setAppName("KarmaDSL").set(
    "spark.executor.cores", "8").set("spark.executor.memory", "1g"))

sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

root_dir = os.path.abspath(os.path.join(os.path.realpath(__file__), '..'))
data_dir = os.path.join(root_dir, "data/datasets")
train_model_dir = os.path.join(root_dir, "data/train_models")

# word2vec = Word2Vec.load_word2vec_format(os.path.join("/Users/minhpham/tools/", 'GoogleNews-vectors-negative300.bin'), binary=True)

file_write = open('debug.txt', 'w')

logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.FATAL)
logger.LogManager.getLogger("akka").setLevel(logger.Level.FATAL)
# Enable inline plotting
# %matplotlib inline is not supported in Databricks; use display() instead

import os    # in case a file has to be removed: os.remove(<filename>)
import sys

import matplotlib
from time import gmtime, strftime
from pyspark import SparkConf, SQLContext
from pyspark.sql import SparkSession

showtime = strftime("%Y%m%d-%H%M%S", gmtime())
appName = "Chart Lab Diagnoses: " + showtime

if True:    # sc is set automatically in Databricks; set the master here for CDSW
    conf = SparkConf().setMaster("spark://ken-HP-EliteBook-8530w:7077")
    spark = SparkSession.builder.appName(appName).config(conf=conf).getOrCreate()

sc = spark.sparkContext
sqlctx = SQLContext(sc)

if False:    # snappy is the default compression
  sqlctx.setConf("spark.sql.parquet.compression.codec", "uncompressed") 



print(sc.master)
print(spark.sparkContext.getConf)
print('Python version ' + sys.version)
# print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)


# COMMAND ----------
def load_data(path):
	from pyspark.sql import SQLContext
	sqlContext = SQLContext(sc)
	data = sqlContext.read.parquet(path)
	return data
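# Load the model lookup table from parquet and broadcast it to every executor as a plain dict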
def load_model():
	from pyspark.sql import SQLContext
	sqlContext = SQLContext(sc)
	lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
	lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())
	return lookup_bd
def save_dist(df_path, fname):
	from pyspark.sql import SQLContext
	sqlContext = SQLContext(sc)
	df = sqlContext.read.parquet(df_path)
	df.save(fname, "com.databricks.spark.csv", "overwrite")
Example #15
def signalDbDataFrameFromSignalDbRDD(sparkContext, rdd):

    sqlContext = SQLContext(sparkContext)

    return sqlContext.createDataFrame(rdd, StructType(_structFieldArray()))
Example #16
    def main(self, sc, *args):
        sql_context = SQLContext(sc)
        df = self.read_from_hdfs(sql_context)
        self.write_to_db(df)
# import findspark
# findspark.init()

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark import SparkConf

# import os

# os.environ['PYSPARK_SUBMIT_ARGS'] = "--master local[2] pyspark-shell"
# os.environ['JAVA_HOME']  = "$(/usr/libexec/java_home -v 13)"

#################### Run this Python file with spark-submit: ####################
# spark-submit i0spark-submit-demo.py

sc = SparkContext("local","PySparkShell")
spark = SQLContext(sc)


lines = sc.textFile("i1introduction.md")

c = lines.count()
f = lines.first()

print('#'*20)
print(c, f)
from pyspark import SparkContext, SQLContext
import matplotlib.pyplot as plt
import pyspark.sql.functions as func
plt.rcParams["figure.figsize"] = (20,10)

import pandas as pd
try:
    sc = SparkContext("local", "Simple App")
except ValueError:
    pass


# In[2]:


sql_ctx = SQLContext(sc)


# Reading the data

# In[3]:


train_data = sql_ctx.read.csv('/home/ravibisla/PycharmProjects/DataScience/train_rating.txt', header=True)
train_data.registerTempTable('train_data')
train_data.cache()
test_data = sql_ctx.read.csv('/home/ravibisla/PycharmProjects/DataScience/test_rating.txt', header=True)
test_data.registerTempTable('test_data')
test_data.cache()
# train_review = sql_ctx.read.json('/home/ravibisla/PycharmProjects/DataScience/train_review.json')
# train_review.registerTempTable('train_review')
Example #19
# coding: utf-8

# ## PySpark test

# In[4]:

from pyspark.sql.types import *
from pyspark import SQLContext, HiveContext, SparkContext

# In[9]:

sc = SparkContext()

# In[10]:

sqlcontext = SQLContext(sc)

# In[11]:

hive_context = HiveContext(sc)

# In[12]:

rdd = sc.parallelize(range(1000))

# In[13]:

rdd.takeSample(False, 5)

# In[1]:
Example #20
from pyspark import SparkContext, SQLContext

sc = SparkContext(appName='emrtest')
sqlCtx = SQLContext(sc)

print(sc)

rdd = sc.parallelize(['1', '2', '3'])

print('the count in rdd is', rdd.count())

rdd.repartition(1).saveAsTextFile(
    'hdfs://ec2-52-87-197-156.compute-1.amazonaws.com/tmp/count')
Example #21
def main():
    """Main function"""

    # Get args
    args = get_args()

    # container
    container_in = args.container_in
    container_out = args.container_out

    # Azure credentials
    sas_token = args.sas
    storage_account_name = args.storage
    azure_accounts = list()
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_in
    })
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_out
    })

    oauth_login = args.oauth_login
    oauth_client_id = args.oauth_client_id
    oauth_client_secret = args.oauth_client_secret

    # requires hadoop 3.2+
    # azure_oauth = {
    #     "endpoint": oauth_login,
    #     "client-id": oauth_client_id,
    #     "client-secret": oauth_client_secret
    # }
    azure_oauth = False

    # VM
    cores = args.vm_cores
    ram = args.vm_ram
    shuffle_partitions = args.shuffle_partitions

    # Date, prefix
    country = args.country
    prefix = args.prefix

    # process config
    roam_dist_stops = args.roam_dist_stops
    roam_dist_events = args.roam_dist_events

    # Path in - path out
    blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/stoplocation-v8_prefix_r70-s5-a70-h6/{country}/"
    timezones_in = f"wasbs://cuebiq-data@{storage_account_name}.blob.core.windows.net/utils_states_timezones/"
    if azure_oauth:
        # we can leverage abfss
        blob_in = f"abfss://{container_in}@{storage_account_name}.dfs.core.windows.net/stoplocation-v8_prefix_r70-s5-a70-h6/country={country}/"
        timezones_in = f"abfss://cuebiq-data@{storage_account_name}.dfs.core.windows.net/utils_states_timezones/"

    path_out_distinct = f"distinct_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m_v{VERSION}/country={country}"
    path_out_all = f"all_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m_v{VERSION}/country={country}"

    # config spark
    conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts,
                          azure_oauth)

    # set prop for handling partition columns as strings (fixes prefixes as int)
    conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")

    # Create spark session
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession
    # register UDF from jar
    spark.udf.registerJavaFunction(
        "geohash", "it.smartcommunitylab.sco.mobilitycovid.udf.GeohashEncode")

    # Init azure client
    blob_service_client = BlobServiceClient.from_connection_string(
        CONN_STRING.format(storage_account_name, sas_token))

    #  build keys, date is mandatory, prefix opt
    partition_key = f"prefix={prefix}"

    print("process " + partition_key)
    start_time = time.time()
    local_dir = LOCAL_PATH + partition_key
    print("write temp to " + local_dir)

    # cleanup local if exists
    if os.path.isdir(local_dir):
        # note: map() is lazy in Python 3, so the original map(os.unlink, ...) never ran
        for f in os.listdir(local_dir):
            os.unlink(os.path.join(local_dir, f))

    # Input dataset
    print("read dataset table")
    read_time = time.time()

    # explode days manually
    dates = [datetime(2020, 1, 1) + timedelta(days=x) for x in range(0, 258)]
    blobs_in = [
        "{}/year={}/month={}/day={}/prefix={}".format(blob_in, d.year, d.month,
                                                      d.day, prefix)
        for d in dates
    ]

    #dfs = spark.read.format("parquet").load(*blobs_in)
    dfs = read_multiple_df(spark, blobs_in)
    dfs_timezones = spark.read.format("parquet").load(timezones_in)

    # manually inject prefix column
    dfs = dfs.withColumn("prefix", F.lit(prefix))

    # apply partition filter
    dfs_state = dfs.where(f"prefix = '{prefix}'")

    print("processing with spark")
    spark_time = time.time()

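    # Group consecutive stops of each user into visit clusters: rn flags the start of a
    # new group whenever the travelled-distance columns or the day/hour gap indicate a
    # break; the running sum of rn then becomes the group id used for aggregation below.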
    w = Window().partitionBy('userId').orderBy('begin')

    dfs_state = add_distance_column(dfs_state, order_column='begin')
    dfs_state = dfs_state.fillna(0, subset=['next_travelled_distance'])
    dfs_state = dfs_state.withColumn(
        'lag_next_travelled_distance',
        F.lag(col('next_travelled_distance')).over(w))
    dfs_state = dfs_state.withColumn('lag_end', F.lag('end').over(w))
    dfs_state = dfs_state.withColumn(
        'rn',
        F.when(
            ((col('lag_next_travelled_distance') !=
              col('prev_travelled_distance')) |
             (col('prev_travelled_distance') > 0) |
             (col('lag_next_travelled_distance') > 0) |
             (col('distance_prev') > roam_dist_events) |
             ((F.dayofyear(col('begin')) - F.dayofyear(col('lag_end')) == 1) &
              (F.hour(col('begin')) < 6))) & ((col('lag_end').isNull()) |
                                              (col('lag_end') < col('begin'))),
            1).otherwise(0))
    # Remove prev_travelled distance when rn == 0 (it happens when lag_end and begin overlap)
    dfs_state = dfs_state.withColumn(
        'prev_travelled_distance',
        F.when(col('rn') == 0, 0).otherwise(col('prev_travelled_distance')))

    w = Window().partitionBy('userId').orderBy('begin').rangeBetween(
        Window.unboundedPreceding, 0)

    dfs_state = dfs_state.withColumn('group', F.sum('rn').over(w))

    dfs_state = dfs_state.groupBy('userId', 'group', 'state').agg(
        F.mean('latitude').alias('latitude'),
        F.mean('longitude').alias('longitude'),
        F.min('begin').alias('begin'),
        F.max('end').alias('end')).drop('group')

    dfs_destinations = get_destinations(dfs_state, roam_dist=roam_dist_stops)
    dfs_destinations = dfs_destinations.withColumn(
        'prefix', dfs_destinations.userId.substr(1, 2))
    dfs_destinations = dfs_destinations.withColumn('dayofyear',
                                                   F.dayofyear('begin'))
    dfs_destinations = dfs_destinations.withColumn('year', F.year('begin'))
    # dfs_destinations = dfs_destinations.withColumn('state', F.lit(state))

    # Local time
    dfs_destinations.createOrReplaceTempView("dfs_destinations")
    dfs_destinations = spark.sql("""
      SELECT dfs_destinations.*, geohash(clusterLatitude, clusterLongitude, 7) as geohash7
      from dfs_destinations
      """)
    dfs_destinations = dfs_destinations.withColumn(
        'geohash5', F.substring(col('geohash7'), 1, 5))
    dfs_destinations = dfs_destinations.join(F.broadcast(dfs_timezones),
                                             on='geohash5').drop('geohash5')
    dfs_destinations = dfs_destinations.withColumn(
        'local_begin', F.from_utc_timestamp(col('begin'), col('tzid')))
    dfs_destinations = dfs_destinations.withColumn(
        'offset',
        ((col('local_begin').cast('long') - col('begin').cast('long')) /
         3600).cast('int')).drop('local_begin')
    dfs_destinations.persist(StorageLevel.DISK_ONLY)

    # Write
    # output as country/prefix/part1..N
    local_dir_all = local_dir + "/all/"
    dfs_destinations_all = dfs_destinations.select('prefix', 'userId',
                                                   'clusterId', 'begin', 'end',
                                                   'offset', 'year',
                                                   'dayofyear')
    dfs_destinations_all.repartition(8, 'dayofyear').write.format(
        'parquet').mode('overwrite').save(local_dir_all + "prefix=" + prefix +
                                          "/")

    # output as country/prefix/state
    local_dir_distinct = local_dir + "/distinct/"
    dfs_destinations_distinct = dfs_destinations.select(
        'prefix', 'userId', 'clusterId', 'clusterLatitude', 'clusterLongitude',
        'geohash7', 'state').distinct()
    dfs_destinations_distinct.repartition("state").write.partitionBy(
        "state").format('parquet').mode('overwrite').save(local_dir_distinct +
                                                          "prefix=" + prefix +
                                                          "/")

    dfs_destinations.unpersist()

    print("upload local data to azure")
    upload_time = time.time()

    # upload parts 1  "prefix/state"
    print(f"upload files for distinct")
    # upload with threads
    dfutures = []
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        fprefix = prefix
        print(f"upload files for distinct: {fprefix}")
        prefix_dir = local_dir_distinct + "prefix=" + fprefix
        prefix_key = f"prefix={fprefix}"

        for state in US_STATES:
            s_key = f"state={state}"
            f_dir = prefix_dir + "/" + s_key
            f_key = prefix_key + "/" + s_key

            # print(f"read files for distinct from {f_dir}")

            if (os.path.isdir(f_dir)):
                files = [
                    filename for filename in os.listdir(f_dir)
                    if filename.startswith("part-")
                ]

                if len(files) > 0:

                    for file_local in files:
                        file_path = f_dir + "/" + file_local
                        part_num = int(file_local.split('-')[1])
                        part_key = '{:05d}'.format(part_num)
                        # fix name as static hash to be reproducible
                        filename_hash = hashlib.sha1(
                            str.encode(f_key + f_key + part_key)).hexdigest()

                        blob_key = "{}/{}/part-{}-{}.snappy.parquet".format(
                            path_out_distinct, f_key, part_key, filename_hash)

                        # print("upload " + file_path + " to " + container_out+":"+blob_key)
                        # upload_blob(blob_service_client,container_out, blob_key, file_path)
                        future = executor.submit(upload_blob,
                                                 blob_service_client,
                                                 container_out, blob_key,
                                                 file_path)
                        dfutures.append(future)

                # else:
                #    print(f"no files to upload for {f_key}")

            # else:
            #    print(f"missing partition for {f_key}")

        # end of loop, wait for futures
        for future in dfutures:
            bkey = future.result()

    # ensure we wait all tasks
    # TODO check if all done
    ddone = concurrent.futures.wait(dfutures)

    # upload parts 2 "prefix/parts"
    print(f"upload files for all")
    fprefix = prefix
    # upload with threads
    afutures = []
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        print(f"upload files for all: {fprefix}")
        prefix_dir = local_dir_all + "prefix=" + fprefix
        prefix_key = f"prefix={fprefix}"

        if (os.path.isdir(prefix_dir)):
            files = [
                filename for filename in os.listdir(prefix_dir)
                if filename.startswith("part-")
            ]

            if len(files) > 0:

                for file_local in files:
                    file_path = prefix_dir + "/" + file_local
                    part_num = int(file_local.split('-')[1])
                    part_key = '{:05d}'.format(part_num)
                    # fix name as static hash to be reproducible
                    filename_hash = hashlib.sha1(
                        str.encode(prefix_key + part_key)).hexdigest()

                    blob_key = "{}/{}/part-{}-{}.snappy.parquet".format(
                        path_out_all, prefix_key, part_key, filename_hash)

                    # print("upload " + file_path + " to " + container_out+":"+blob_key)
                    # upload_blob(blob_service_client,container_out, blob_key, file_path)
                    future = executor.submit(upload_blob, blob_service_client,
                                             container_out, blob_key,
                                             file_path)
                    afutures.append(future)
            # else:
            #     print(f"no files to upload for {d_key}")

            # else:
            #     print(f"missing partition for {d_key}")
        # end of loop, wait for futures
        for future in afutures:
            bkey = future.result()

    # ensure we wait all tasks
    # TODO check if all done
    adone = concurrent.futures.wait(afutures)

    print("--- {} seconds elapsed ---".format(int(time.time() - start_time)))
    print()
    shutdown_time = time.time()
    spark.stop()

    end_time = time.time()
    print("Done in {} seconds (read:{} spark:{} upload:{} shutdown:{})".format(
        int(end_time - start_time), int(spark_time - read_time),
        int(upload_time - spark_time), int(shutdown_time - upload_time),
        int(end_time - shutdown_time)))
    print('Done.')
Example #22
from pyspark import SparkConf, SparkContext, SQLContext
conf = SparkConf().setMaster('local').setAppName('ALS')
sc = SparkContext(conf=conf)
sq = SQLContext(sc)
path = r'hdfs://localhost:9000//input/python_spark_hadoop/u.data'
rawUserData = sc.textFile(path)
rawRatings = rawUserData.map(lambda line: line.split('\t')[:3])
ratingsRDD = rawRatings.map(lambda x: (x[0], x[1], x[2]))
ratingsRDD.take(5)
# number of ratings
numRatings = ratingsRDD.count()
# number of distinct users
numUsers = ratingsRDD.map(lambda x: x[0]).distinct().count()
# number of distinct movies
numMovies = ratingsRDD.map(lambda x: x[1]).distinct().count()

from pyspark.mllib.recommendation import ALS
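# train an ALS model: rank=10, iterations=10, lambda=0.01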
model = ALS.train(ratingsRDD, 10, 10, 0.01)
print(model)
model.recommendProducts(100, 5)
model.predict(100, 1643)
model.recommendUsers(product=200, num=5)

try:
    model.save(sc, r'file:/home/xiligey/Study/Spark/PythonSparkHadoop/ALSmodel')
except Exception as e:
    print(e)
    print('The model already exists; delete it first or save to a different directory')

from pyspark.mllib.recommendation import MatrixFactorizationModel
try:
Example #23
from pyspark import SQLContext, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import  functions as func
import sys

sc = SparkContext()
sql = SQLContext(sc)
spark = SparkSession(sc)


# read product file
products = spark.read.csv('../data/products.csv', header=True)
print(products.show(3))

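# rank products within each category by descending price (the most expensive gets rank 1)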
window_spec1 = Window.partitionBy(products['category'])\
    .orderBy(products['price'].desc())

price_rank = func.rank().over(window_spec1)
product_rank = products.select(
    products['product'],
    products['category'],
    products['price'],
).withColumn('rank', price_rank)

print(product_rank.show())

# rows between -1 and 1 (previous, current and next row)
window_spec2 = Window.partitionBy(products['category'])\
    .orderBy(products['price'].desc())\
    .rowsBetween(-1,1)
Example #24
from pyspark import SparkConf, SparkContext, SQLContext, Row
from pyspark.sql import functions as F
APP_NAME = "count unique vistor of everyday"


def main(sc, sqlC):
    # mock access-log data
    userAccessLog = [
        "2017-01-01,1122", "2017-01-01,1122", "2017-01-01,1123",
        "2017-01-01,1124", "2017-01-01,1124", "2017-01-02,1122",
        "2017-01-01,1121", "2017-01-01,1123", "2017-01-01,1123"
    ]
    accessLogRDD = sc.parallelize(userAccessLog)
    RowRDD = accessLogRDD.map(
        lambda e: Row(e.split(",")[0], int(e.split(",")[1])))
    df = sqlC.createDataFrame(RowRDD, ['date', 'userid'])
    df.show()
    df.printSchema()
    df.groupBy('date').agg(F.countDistinct(df.userid)).show()


if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sqlC = SQLContext(sc)

    main(sc, sqlC)
Example #25
        sys.stderr.write(
            "Example-1: " + exeName +
            "  10          hdfs:///perfdata/freebasedeletions/* \n")
        sys.stderr.write(
            "Example-2: " + exeName +
            "  1           hdfs:///perf/data/deletions/deletions.csv-00000-of-00020\n"
        )
        exit(-1)

    print(
        str(datetime.now()) + " runCount = " + str(sys.argv[1]) + ", data = " +
        str(sys.argv[2]))

    from pyspark import SparkContext, SQLContext
    beginTime = time.time()

    sparkContext = SparkContext()
    sqlContext = SQLContext(sparkContext)

    PerfBenchmark.RunPerfSuite(FreebaseDeletionsBenchmark, sys.argv,
                               sparkContext, sqlContext)

    sparkContext.stop()

    PerfBenchmark.ReportResult()

    print(
        str(datetime.now()) + " " + os.path.basename(__file__) +
        " : Finished python version benchmark test. Whole time = " +
        ("%.3f" % (time.time() - beginTime)) + " s.")
Example #26
def get_sql_context_instance(spark_context):

    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)

    return globals()['sqlContextSingletonInstance']
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.functions import col, from_utc_timestamp
from ts.flint import FlintContext, summarizers

conf = SparkConf().setMaster("local").setAppName("TimeSeries")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
flintContext = FlintContext(sqlContext)

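# Build a small time-series DataFrame; Flint expects a timestamp column named 'time'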
df = sqlContext.createDataFrame([('2018-08-20', 1.0), ('2018-08-21', 2.0),
                                 ('2018-08-24', 3.0)], ['time', 'v']).withColumn(
                                     'time', from_utc_timestamp(col('time'), 'UTC'))

# Convert to Flint DataFrame
flint_df = flintContext.read.dataframe(df)

# Use Spark DataFrame functionality
flint_df = flint_df.withColumn('v', flint_df['v'] + 1)

# Use Flint functionality
flint_df = flint_df.summarizeCycles(summarizers.count())
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType, StringType, StructField, StructType

# NOTE: `inputs` (the input path prefix used below) is expected to be defined
# elsewhere in the original script, e.g. from a command-line argument.


def main():
    conf = SparkConf().setAppName('housingprice')
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)
    taxreportSchema = StructType([
        StructField('PID', StringType(), False),
        StructField('Legal_Type', StringType(), False),
        StructField('FOLIO', StringType(), False),
        StructField('Coordinates', StringType(), True),
        StructField('ZoneName', StringType(), True),
        StructField('ZoneCat', StringType(), True),
        StructField('LOT', StringType(), True),
        StructField('Block', StringType(), True),
        StructField('plan', StringType(), True),
        StructField('DisLot', StringType(), True),
        StructField('FCiviNum', StringType(), True),
        StructField('TCiviNum', StringType(), True),
        StructField('StreetName', StringType(), True),
        StructField('PostalCode', StringType(), True),
        StructField('NLegalName1', StringType(), True),
        StructField('NLegalName2', StringType(), True),
        StructField('NLegalName3', StringType(), True),
        StructField('NLegalName4', StringType(), True),
        StructField('NLegalName5', StringType(), True),
        StructField('CurVal', StringType(), True),
        StructField('CurImpVal', StringType(), True),
        StructField('Taxassess', StringType(), True),
        StructField('prevVal', StringType(), True),
        StructField('prevImpVal', StringType(), True),
        StructField('YearBuilt', StringType(), True),
        StructField('BigImpYear', StringType(), True),
        StructField('Tax_levy', StringType(), True),
        StructField('NeighbourhoodCode', StringType(), True),
    ])
    conversionSchema = StructType([
        StructField('date', StringType(), False),
        StructField('USD', StringType(), False),
        StructField('rate', StringType(), False),
        StructField('reciprate', StringType(), False),
    ])
    crudeoilSchema = StructType([
        StructField('date', DateType(), False),
        StructField('oilprice', StringType(), False),
    ])
    def fixdate(convVal):
        a = convVal.split(" ")
        dates = a[0].split("/")
        alldate = "20"+dates[2]+'/'+dates[0]
        return (alldate,a[1])
    def filterYear(dates):
        a = dates.split('/')
        if (a[1]=='2016'):
            return False
        else:
            return True
    def processDate(df):
        def splitMonth(cols):
            a = cols.split('/')
            return a[1]

        def splitYear(cols):
            a = cols.split('/')
            return a[0]

        fUDF = udf(splitMonth, StringType())
        df1 = df.withColumn("month", fUDF('year'))
        fUDFyear = udf(splitYear, StringType())
        return df1.withColumn("year", fUDFyear('year'))
    #Reading the Tax Report Dataset
    taxreportinfo = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(taxreportSchema).load(inputs+"taxreport/test")
    taxreportinfo.registerTempTable("taxreport")
    #Selecting the price,TaxAssessment Year and Postalcode of each property
    propertyVal = sqlContext.sql("SELECT CurVal, Taxassess, PostalCode FROM taxreport")
    propertyVal.registerTempTable("propertyVal")
    #Reading the CAN to USD conversion dataset
    conversion = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(conversionSchema).load(inputs+"conversion")
    conversion.registerTempTable("Conversion")
    #Selecting only the date and rate
    conversionrate = sqlContext.sql("SELECT date,rate FROM Conversion WHERE rate regexp '^[0-9]+'")
    conversionRDD = conversionrate.repartition(40).rdd.map(lambda w: (w.date+" "+w.rate))
    conversiondates = conversionRDD.map(fixdate) \
        .filter(lambda wx: filterYear(wx[0])) \
        .map(lambda l: Row(date=l[0], rate=l[1]))
    schemaConv = sqlContext.inferSchema(conversiondates)
    schemaConv.registerTempTable("ConversionDate")
    ConverDF = sqlContext.sql(" SELECT date,CAST(AVG(rate) AS DECIMAL(4,2)) as conversionrate FROM ConversionDate WHERE rate IS NOT NULL GROUP BY date")
    ConverDF.cache()
    #Reading the Canada Crude oil price dataset
    crudeoil = sc.textFile(inputs+"crudeoil")
    crudeoilRDD = crudeoil.map(lambda l: l.split()).map(lambda l: Row(date=l[0], oilprice=l[1]))
    crudeoilDF = sqlContext.inferSchema(crudeoilRDD)
    crudeoilDF.registerTempTable("crudeoil")
    #Selecting the date on M/Y format and oilprice
    oilprice = sqlContext.sql("SELECT DATE_FORMAT(date,'Y/M') as date,oilprice FROM crudeoil")
    oilprice.registerTempTable('oilprice')
    #Reading the interestrate of BC Dataset
    interestRate = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs+"interestrate")
    interestRate.registerTempTable("interest")
    #Selecting the date and 5-year fixed mortgage price from the dataset
    interestDF = sqlContext.sql("SELECT DATE_FORMAT(date,'Y/M') as date,CAST(`5y-fixed-posted` AS DECIMAL(4,2)) AS interestrate FROM interest WHERE date >='2006-01' AND date <= '2015-12'")
    interestDF.registerTempTable("allrates")
    #Getting the average of each month on days whose value is not null.
    avgInterest = sqlContext.sql(" SELECT date,AVG(interestrate) as interestrates FROM allrates WHERE interestrate IS NOT NULL GROUP BY date")
    avgInterest.cache()
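    # join the monthly interest rates with oil prices and CAD/USD conversion rates on date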
    joinedTable = avgInterest.join(oilprice,(avgInterest['date']==oilprice['date'])).select(avgInterest['date'],avgInterest['interestrates'],oilprice['oilprice'])
    JoinedConversion = joinedTable.join(ConverDF,(joinedTable['date']==ConverDF['date'])).select(joinedTable['date'].alias('year'),joinedTable['interestrates'],joinedTable['oilprice'],ConverDF['conversionrate'])
    JoinedConversion.registerTempTable("joinedConversion")
    ls = processDate(JoinedConversion)
    ls.show()