import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

## @params: [JOB_NAME, s3_location_source, s3_location_target, lookup_filename]
# s3_location_source and s3_location_target are assumed to arrive as job
# parameters, mirroring the lookup_filename pattern below.
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 's3_location_source', 's3_location_target', 'lookup_filename'])

s3_location_source = args['s3_location_source']
s3_location_target = args['s3_location_target']
lookup_filename = args['lookup_filename']

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
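
# ---------------------------------------------------------------------------
# The job relies on two helpers, extractall_tarfile and parse_log, that are
# not shown in this section. The definitions below are minimal sketches of
# what they plausibly look like given how they are called, not the original
# implementations: extractall_tarfile must map one (path, bytes) pair from
# sc.binaryFiles to an iterable of (member_name, contents) pairs, and
# parse_log must map one TSV line to a Row that toDF() can infer a schema
# from. The two-column Row layout in parse_log is an assumption.
# ---------------------------------------------------------------------------
import io
import tarfile

from pyspark.sql import Row


def extractall_tarfile(path_and_bytes):
    """Assumed helper: expand one .tar.gz blob into (member_name, text) pairs."""
    _, raw = path_and_bytes
    members = []
    with tarfile.open(fileobj=io.BytesIO(raw), mode='r:gz') as tar:
        for member in tar.getmembers():
            if member.isfile():
                extracted = tar.extractfile(member)
                members.append((member.name,
                                extracted.read().decode('utf-8', 'replace')))
    return members


def parse_log(line):
    """Assumed helper: split one lookup TSV line into a two-column Row."""
    fields = line.split('\t')
    log = Row(id=fields[0], value=fields[1] if len(fields) > 1 else None)
    return log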

print('Starting Parquet conversion ...')
input_file = '%s/*.tar.gz' % s3_location_source  # glob over every tarball in the source prefix
output_folder = s3_location_target  # absolute S3 path

print('input_file= %s' % input_file)
print('output_folder= %s' % output_folder)
print('lookup_filename= %s' % lookup_filename)

# binaryFiles yields one (path, bytes) pair per tarball; extractall_tarfile
# expands each pair into the archive's (member_name, contents) pairs.
zips = sc.binaryFiles(input_file)
files_data = zips.map(extractall_tarfile)

tsv_filename = lookup_filename
output_rdd = (files_data
              .flatMap(lambda x: x)                    # flatten the per-archive member lists
              .filter(lambda x: x[0] == tsv_filename)  # keep only the lookup TSV
              .map(lambda x: x[1])                     # drop the name, keep the contents
              .flatMap(lambda x: x.split('\n')))       # one record per line

print(output_rdd.count())
df = output_rdd.map(parse_log).toDF()

df.distinct().write.mode('overwrite').parquet(output_folder)
print('Done Parquet conversion!')
df.printSchema()
job.commit()
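
# How the parameterized job above might be launched (a sketch using boto3's
# start_job_run, shown as a comment because it runs outside this script; the
# job name and S3 paths are placeholders, not taken from the original):
#
#   import boto3
#   glue = boto3.client('glue')
#   glue.start_job_run(
#       JobName='omniture-lookup-to-parquet',
#       Arguments={
#           '--s3_location_source': 's3://example-bucket/omniture/lookups',
#           '--s3_location_target': 's3://example-bucket/parquet/lookups',
#           '--lookup_filename': 'mobile_attributes.tsv',
#       })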


# ---------------------------------------------------------------------------
# Standalone, hardcoded variant of the same conversion, kept for reference.
# It builds its own SparkContext, so run it as a separate script rather than
# in the same file as the job above.
# ---------------------------------------------------------------------------

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

filename = 's3://move-dataeng-dropbox-prod/adobe/omniture/mobilelookup/*.tar.gz'
zips = sc.binaryFiles(filename)
files_data = zips.map(extractall_tarfile)

tsv_filename_base = 'mobile_attributes'
tsv_filename = 'mobile_attributes.tsv'
output_rdd = (files_data
              .flatMap(lambda x: x)                    # flatten the per-archive member lists
              .filter(lambda x: x[0] == tsv_filename)  # keep only the lookup TSV
              .map(lambda x: x[1])                     # drop the name, keep the contents
              .flatMap(lambda x: x.split('\n')))       # one record per line

print(output_rdd.count())
df = output_rdd.map(parse_log).toDF()

bucket_name = 's3://move-dataeng-temp-dev/glue-etl/omniture/lookups'  # output prefix
out_filename = '%s/%s' % (bucket_name, tsv_filename_base)

df.distinct().write.mode('overwrite').parquet(out_filename)
job.commit()
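
# Optional sanity check (an assumption, not part of the original job): read
# the Parquet output back and confirm the schema and row count look right.
check_df = spark.read.parquet(out_filename)
check_df.printSchema()
print(check_df.count())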