def run(self):
    avro_file = self.get_opt('avro')
    parquet_dir = self.get_opt('parquet_dir')
    # let Spark fail if avro/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("Avro Source: %s" % avro_file)
    log.info("Parquet Destination: %s" % parquet_dir)
    conf = SparkConf().setAppName('HS PySpark Avro => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    if self.verbose < 3 and 'setLogLevel' in dir(sc):
        sc.setLogLevel('WARN')
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    if isMinVersion(spark_version, 1.4):
        # this doesn't work in Spark <= 1.3 - the GitHub docs don't mention the older .method() for reading avro
        df = sqlContext.read.format('com.databricks.spark.avro').load(avro_file)
        df.write.parquet(parquet_dir)
    else:
        die('Spark <= 1.3 is not supported due to avro dependency, sorry! ' +
            'I may change this on request but prefer people just upgrade')
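# For comparison only -- a minimal sketch of the same conversion on Spark 2.4+, where
# SparkSession replaces SQLContext and Avro support comes from the built-in 'avro' format
# (the org.apache.spark:spark-avro module still needs to be on the classpath, e.g. via
# spark-submit --packages). Function name and paths are illustrative, not part of the tool above.
from pyspark.sql import SparkSession

def avro_to_parquet_spark24(avro_path, parquet_path):
    spark = SparkSession.builder.appName('Avro => Parquet').getOrCreate()
    df = spark.read.format('avro').load(avro_path)  # built-in Avro source in Spark 2.4+
    df.write.parquet(parquet_path)                  # write out as Parquet, same as the tool above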
def run(self):
    json_file = self.get_opt('json')
    parquet_dir = self.get_opt('parquet_dir')
    # let Spark fail if json/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("JSON Source: %s" % json_file)
    log.info("Parquet Destination: %s" % parquet_dir)
    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    if self.verbose < 3 and 'setLogLevel' in dir(sc):
        sc.setLogLevel('WARN')
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        df = sqlContext.read.json(json_file)  # pylint: disable=invalid-name
        df.write.parquet(parquet_dir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        df = sqlContext.jsonFile(json_file)  # pylint: disable=invalid-name
        df.saveAsParquetFile(parquet_dir)
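# Hypothetical usage example -- the script filename and the switch names are assumptions
# inferred from the get_opt('json') / get_opt('parquet_dir') calls above, not confirmed CLI flags:
#
#   spark-submit spark_json_to_parquet.py --json /data/input.json --parquet-dir /data/output
#
# No extra --packages are needed here: both the JSON reader and the Parquet writer ship with
# Spark itself, unlike the Avro and CSV conversions which rely on the Databricks connectors.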
def run(self):
    self.no_args()
    json_file = self.options.json
    avro_dir = self.options.avro_dir
    # let Spark fail if json/avro dirs aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("JSON Source: %s" % json_file)
    log.info("Avro Destination: %s" % avro_dir)
    conf = SparkConf().setAppName('HS PySpark Json => Avro')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    df = None
    if isMinVersion(spark_version, 1.4):
        df = sqlContext.read.json(json_file)
    else:
        die('Spark <= 1.3 is not supported due to avro dependency, sorry! ' +
            'I may change this on request but prefer people just upgrade')
        # log.warn('running legacy code for Spark <= 1.3')
        # json = sqlContext.jsonFile(json_file)
    # this doesn't work in Spark <= 1.3 and the GitHub docs don't mention the older methods
    # for writing avro using the databricks avro driver
    df.write.format('com.databricks.spark.avro').save(avro_dir)
def run(self):
    self.no_args()
    json_file = self.get_opt('json')
    avro_dir = self.get_opt('avro_dir')
    # let Spark fail if json/avro dirs aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("JSON Source: %s" % json_file)
    log.info("Avro Destination: %s" % avro_dir)
    conf = SparkConf().setAppName('HS PySpark Json => Avro')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    if self.verbose < 3 and 'setLogLevel' in dir(sc):
        sc.setLogLevel('WARN')
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    df = None
    if isMinVersion(spark_version, 1.4):
        df = sqlContext.read.json(json_file)
    else:
        die('Spark <= 1.3 is not supported due to avro dependency, sorry! ' +
            'I may change this on request but prefer people just upgrade')
        # log.warn('running legacy code for Spark <= 1.3')
        # json = sqlContext.jsonFile(json_file)
    # this doesn't work in Spark <= 1.3 and the GitHub docs don't mention the older methods
    # for writing avro using the databricks avro driver
    df.write.format('com.databricks.spark.avro').save(avro_dir)
def run(self):
    parquet_file = self.get_opt('parquet')
    avro_dir = self.get_opt('avro_dir')
    # let Spark fail if avro/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("Parquet Source: %s" % parquet_file)
    log.info("Avro Destination: %s" % avro_dir)
    conf = SparkConf().setAppName('HS PySpark Parquet => Avro')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    if isMinVersion(spark_version, 1.4):
        # this doesn't work in Spark <= 1.3 - the GitHub docs don't mention the older .method() for writing avro
        df = sqlContext.read.parquet(parquet_file)
        df.write.format('com.databricks.spark.avro').save(avro_dir)
    else:
        die('Spark <= 1.3 is not supported due to avro dependency, sorry! ' +
            'I may change this on request but prefer people just upgrade')
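# Hypothetical invocation for the Spark 1.4+ path above -- the 'com.databricks.spark.avro'
# format only resolves if the Databricks spark-avro connector is on the classpath, e.g. via
# --packages. The script filename, switch names and artifact version below are assumptions;
# match the artifact to your Spark/Scala build:
#
#   spark-submit --packages com.databricks:spark-avro_2.10:2.0.1 \
#       spark_parquet_to_avro.py --parquet /data/in.parquet --avro-dir /data/out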
def run(self):
    json_file = self.get_opt('json')
    parquet_dir = self.get_opt('parquet_dir')
    # let Spark fail if json/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("JSON Source: %s" % json_file)
    log.info("Parquet Destination: %s" % parquet_dir)
    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        df = sqlContext.read.json(json_file)  # pylint: disable=invalid-name
        df.write.parquet(parquet_dir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        df = sqlContext.jsonFile(json_file)  # pylint: disable=invalid-name
        df.saveAsParquetFile(parquet_dir)
def run(self):
    csv_file = self.options.csv
    avro_dir = self.options.avro_dir
    has_header = self.options.has_header
    # I don't know why the Spark guys made this a string instead of a bool
    header_str = 'false'
    if has_header:
        header_str = 'true'
    schema = self.options.schema
    # let Spark fail if csv/avro dirs aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("CSV Source: %s" % csv_file)
    log.info("Avro Destination: %s" % avro_dir)

    if schema:
        def get_type(arg):
            arg = str(arg).lower()
            if arg not in self.types_mapping:
                self.usage("invalid type '%s' defined in --schema, must be one of: %s"
                           % (arg, ', '.join(sorted(self.types_mapping.keys()))))
            # return self.types_mapping[arg]
            module = __import__('pyspark.sql.types', globals(), locals(), ['types'], -1)
            class_ = getattr(module, self.types_mapping[arg])
            _ = class_()
            return _

        def create_struct(arg):
            name = arg
            data_type = 'string'
            if ':' in arg:
                (name, data_type) = arg.split(':', 1)
            data_class = get_type(data_type)
            return StructField(name, data_class, True)

        # see https://github.com/databricks/spark-csv#python-api
        self.schema = StructType([create_struct(_) for _ in schema.split(',')])
        log.info('generated CSV => Spark schema')

    conf = SparkConf().setAppName('HS PySpark CSV => Avro')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    df = None
    if isMinVersion(spark_version, 1.4):
        if has_header and not schema:
            log.info('inferring schema from CSV headers')
            df = sqlContext.read.format('com.databricks.spark.csv')\
                            .options(header=header_str, inferschema='true')\
                            .load(csv_file)
        else:
            log.info('using explicitly defined schema')
            schema = self.schema
            df = sqlContext.read\
                           .format('com.databricks.spark.csv')\
                           .options(header=header_str)\
                           .load(csv_file, schema=schema)
    else:
        die('Spark <= 1.3 is not supported due to avro dependency, sorry! ' +
            'I may change this on request but prefer people just upgrade')
        # log.warn('running legacy code for Spark <= 1.3')
        # if has_header and not schema:
        #     log.info('inferring schema from CSV headers')
        #     df = sqlContext.load(source="com.databricks.spark.csv", path=csv_file,
        #                          header=header_str, inferSchema='true')
        # elif self.schema:
        #     log.info('using explicitly defined schema')
        #     df = sqlContext.load(source="com.databricks.spark.csv", path=csv_file,
        #                          header=header_str, schema=self.schema)
        # else:
        #     die('no header and no schema, caught late')
    # this doesn't work in Spark <= 1.3 and the GitHub docs don't mention the older methods
    # for writing avro using the databricks avro driver
    df.write.format('com.databricks.spark.avro').save(avro_dir)
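# Worked example of the --schema handling above, assuming types_mapping maps 'string' and 'int'
# to the StringType and IntegerType class names (the real mapping lives elsewhere in the class):
# a --schema value of 'name:string,age:int' is split on ',' and ':' and produces the equivalent
# of this hand-written schema.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

example_schema = StructType([
    StructField('name', StringType(), True),  # 'name:string'
    StructField('age', IntegerType(), True),  # 'age:int'
])
# Fields listed without a ':type' suffix default to string, as create_struct() shows.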
def run(self):
    csv_file = self.get_opt('csv')
    parquet_dir = self.get_opt('parquet_dir')
    has_header = self.get_opt('has_header')
    # I don't know why the Spark guys made this a string instead of a bool
    header_str = 'false'
    if has_header:
        header_str = 'true'
    schema = self.get_opt('schema')
    # let Spark fail if csv/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other URI scheme Spark supports
    log.info("CSV Source: %s" % csv_file)
    log.info("Parquet Destination: %s" % parquet_dir)

    if schema:
        def get_type(arg):
            arg = str(arg).lower()
            if arg not in self.types_mapping:
                self.usage("invalid type '%s' defined in --schema, must be one of: %s"
                           % (arg, ', '.join(sorted(self.types_mapping.keys()))))
            # return self.types_mapping[arg]
            module = __import__('pyspark.sql.types', globals(), locals(), ['types'], -1)
            class_ = getattr(module, self.types_mapping[arg])
            _ = class_()
            return _

        def create_struct(arg):
            name = str(arg).strip()
            data_type = 'string'
            if ':' in arg:
                (name, data_type) = arg.split(':', 1)
            data_class = get_type(data_type)
            return StructField(name, data_class, True)

        # see https://github.com/databricks/spark-csv#python-api
        self.schema = StructType([create_struct(_) for _ in schema.split(',')])
        log.info('generated CSV => Spark schema')

    conf = SparkConf().setAppName('HS PySpark CSV => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    # pylint: disable=invalid-name
    df = None
    if isMinVersion(spark_version, 1.4):
        if has_header and not schema:
            log.info('inferring schema from CSV headers')
            df = sqlContext.read.format('com.databricks.spark.csv')\
                            .options(header=header_str, inferschema='true')\
                            .load(csv_file)
        else:
            log.info('using explicitly defined schema')
            df = sqlContext.read\
                           .format('com.databricks.spark.csv')\
                           .options(header=header_str)\
                           .load(csv_file, schema=self.schema)
        df.write.parquet(parquet_dir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        if has_header and not schema:
            log.info('inferring schema from CSV headers')
            df = sqlContext.load(source="com.databricks.spark.csv", path=csv_file,
                                 header=header_str, inferSchema='true')
        elif self.schema:
            log.info('using explicitly defined schema')
            schema = self.schema
            df = sqlContext.load(source="com.databricks.spark.csv", path=csv_file,
                                 header=header_str, schema=schema)
        else:
            die('no header and no schema, caught late')
        df.saveAsParquetFile(parquet_dir)
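# For comparison only -- on Spark 2.0+ the CSV reader is built in, so the Databricks spark-csv
# package and the 'true'/'false' header string workaround above are no longer needed. A minimal
# sketch (function name and paths are illustrative):
from pyspark.sql import SparkSession

def csv_to_parquet_spark2(csv_path, parquet_path, header=True):
    spark = SparkSession.builder.appName('CSV => Parquet').getOrCreate()
    df = spark.read.csv(csv_path, header=header, inferSchema=True)  # built-in CSV reader
    df.write.parquet(parquet_path)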