#!/usr/bin/env python
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2015-11-03 21:38:52 +0000 (Tue, 03 Nov 2015)
#
# https://github.com/harisekhon/pytools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help improve or steer this or other code I publish
#
# http://www.linkedin.com/in/harisekhon
#
"""
PySpark program to convert JSON file(s) to Parquet
Written to work across Python 2.x and Spark versions, especially Spark given that the Spark API changed after 1.3
"""
from __future__ import print_function
__author__ = 'Hari Sekhon'
__version__ = '0.3.3'
import glob
import logging
import os
import sys
# using optparse rather than argparse for servers still on Python 2.6
from optparse import OptionParser
sys.path.append(os.path.join(os.path.dirname(__file__), 'pylib'))
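# the adjacent pylib directory supplies harisekhon.utils, whose helpers are
# used below: prog, warn, die, usage, isVersionLax, isMinVersion, support_msg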
try:
    from harisekhon.utils import *
except ImportError as e:
    print('module import failed: %s' % e, file=sys.stderr)
    sys.exit(4)
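# locate the PySpark and py4j libraries inside the Spark installation so the
# 'from pyspark import ...' below works without a pip-installed pyspark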
spark_home = os.getenv('SPARK_HOME', None)
if spark_home:
    # pyspark.zip doesn't contain py4j, so may as well just use the already unpacked version
    #sys.path.append(os.path.join(spark_home, 'python/lib/pyspark.zip'))
    sys.path.append(os.path.join(spark_home, 'python'))
    # more abstract without version number but not available in the Spark bin download
    #sys.path.append(os.path.join(spark_home, 'python/build'))
    for x in glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')):
        sys.path.append(x)
else:
    warn("SPARK_HOME not set - probably won't find PySpark libs")
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SQLContext
except ImportError as e:
    print('module import failed: %s' % e, file=sys.stderr)
    sys.exit(4)
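# note: on Spark 2.x+ SQLContext still works but SparkSession is the
# preferred entry point; SQLContext is kept here for Spark 1.x compatibility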

def main():
    log = logging.getLogger(prog)
    log.setLevel(logging.INFO)
    # bit hackish and hard to keep aligned with docstring changes, not using this
    # usage = '\r\b\r\b\r' + __doc__ + "usage: %prog -j file.json -p directory.parquet"
    # parser = OptionParser(usage=usage, version='%prog ' + __version__)
    parser = OptionParser(version='%prog ' + __version__)
    parser.add_option('-j', '--json', dest='jsonFile', help='JSON input file/dir', metavar='<file/dir>')
    parser.add_option('-p', '--parquetDir', dest='parquetDir', help='Parquet output dir', metavar='<dir>')
    (options, args) = parser.parse_args()
    jsonFile = options.jsonFile
    parquetDir = options.parquetDir
    if args or not jsonFile or not parquetDir:
        usage(parser)
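    # the app name set here is what shows up in the Spark UI and job listings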
    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        json = sqlContext.read.json(jsonFile)
        json.write.parquet(parquetDir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        json = sqlContext.jsonFile(jsonFile)
        json.saveAsParquetFile(parquetDir)

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        pass
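
# For reference, a minimal modern equivalent (a sketch, assuming Spark >= 2.0
# with a pip-installed pyspark; the input/output paths are hypothetical):
#
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.appName('json-to-parquet').getOrCreate()
#   spark.read.json('data.json').write.parquet('data.parquet')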