Example #1
# Thanks to this answer, this was solved:
# https://stackoverflow.com/a/46360434/8510370
#
# NOTE: Download the PostgreSQL JDBC driver from
#       https://jdbc.postgresql.org/download.html and place it in your
#       project, or copy it to all machines and point the SPARK_CLASSPATH
#       environment variable at it.

import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, DataFrameReader

# Use SPARK_CLASSPATH if it is set; otherwise expect the driver jar in the
# current working directory.
sparkClassPath = os.getenv('SPARK_CLASSPATH',
                           os.path.join(os.getcwd(), 'postgresql-42.2.2.jar'))

conf = SparkConf()
conf.setAppName('application')
# Ship the driver jar with the job and put it on both the driver and
# executor classpaths.
conf.set('spark.jars', 'file:%s' % sparkClassPath)
conf.set('spark.executor.extraClassPath', sparkClassPath)
conf.set('spark.driver.extraClassPath', sparkClassPath)
# Uncomment the line below and adjust the address if you need to run against
# a standalone cluster master.
# conf.set('spark.master', 'spark://127.0.0.1:7077')

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
url = 'jdbc:postgresql://127.0.0.1:5432/movielens'
properties = {'user': '******', 'password': '******'}

# Equivalent to sqlContext.read.jdbc(...); without a partition column this
# reads the whole table into a single partition.
df = DataFrameReader(sqlContext).jdbc(url=url,
                                      table='movies',
                                      properties=properties)

df.select("movieid", "title").show(n=2)

# Convert the selected columns into an RDD of (movieid, title) tuples.
movies_rdd = df.select("movieid", "title").rdd.map(tuple)
print(movies_rdd.take(5))
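
# A possible follow-up, not part of the original example: a minimal sketch of
# writing a DataFrame back out over the same JDBC connection. The target
# table name 'movies_copy' is hypothetical; mode='overwrite' replaces any
# existing table of that name.
df.select("movieid", "title").write.jdbc(url=url,
                                         table='movies_copy',
                                         mode='overwrite',
                                         properties=properties)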
Example #2
    # This fragment assumes module-level imports of sys, SparkConf and
    # SparkContext from pyspark, SQLContext and DataFrameReader from
    # pyspark.sql, concat_ws/udf/lit from pyspark.sql.functions,
    # StringType/DoubleType from pyspark.sql.types, and the project-local
    # utils, clean_string, and match_score helpers.
    # Expect exactly two CLI arguments: the database host and the Spark master.
    host, master = sys.argv[1:]
    conf.setMaster(master)
    # utils.connection_properties is a project-local helper; it must return a
    # dict with host, port, and db keys (used below) plus the JDBC credentials.
    params = utils.connection_properties(host, db='owner')
    url = 'postgresql://{host}:{port}/{db}'.format(**params)
    # NOTE: numPartitions has no effect here because no partition column and
    # bounds are supplied; see the partitioned-read sketch after this example.
    df = DataFrameReader(sqlContext).jdbc(url='jdbc:{}'.format(url),
                                          table='assessor.owndat',
                                          properties=params,
                                          numPartitions=8)

    # Concatenate the address components into one space-separated string.
    df = df.withColumn(
        'adrconcat',
        concat_ws(' ', df.adrno, df.adrdir, df.adrstr, df.adrsuf, df.cityname,
                  df.statecode, df.zip1))

    # Wrap the plain-Python clean_string helper as a Spark UDF and normalize
    # the owner name and the concatenated address.
    func_clean_string = udf(clean_string, StringType())
    df = df.withColumn('own1', func_clean_string('own1'))
    df = df.withColumn('adrconcat', func_clean_string('adrconcat'))
    uniq_own = df.select(df.own1.alias('uniq_own')).distinct()

    # A constant 'key' column on both sides turns the equijoin below into a
    # Cartesian product.
    df = df.withColumn('key', lit(0))
    uniq_own = uniq_own.withColumn('key', lit(0))

    # Sample both sides (0.01%, fixed seed) to keep the cross product small,
    # then pair every sampled row with every sampled distinct owner name.
    a = df.select(df.own1, df.adrconcat, df.parid,
                  df.key).sample(False, .0001, 42)
    b = uniq_own.select(uniq_own.uniq_own,
                        uniq_own.key).sample(False, .0001, 42)
    c = a.join(b, a.key == b.key, 'outer')
    # Score each (own1, uniq_own) pair with the project-local match_score UDF.
    own_score = udf(match_score, DoubleType())
    c = c.withColumn('own_score', own_score('own1', 'uniq_own'))
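
    # For comparison, a minimal sketch of a partitioned JDBC read (an
    # assumption, not part of the original code): numPartitions only takes
    # effect together with a numeric partition column and its bounds. The
    # column name 'recnum' and the bounds below are hypothetical placeholders.
    df_part = DataFrameReader(sqlContext).jdbc(url='jdbc:{}'.format(url),
                                               table='assessor.owndat',
                                               column='recnum',
                                               lowerBound=1,
                                               upperBound=1000000,
                                               numPartitions=8,
                                               properties=params)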