def import_file(json_fname, hdfs_path='/var/metlog/data'):
    '''
    Import a JSON log file into HDFS.

    Every line of `json_fname` is stripped of surrounding whitespace and
    appended to a new file whose name is obtained from
    `PyHDFS.next_filename(hdfs_path)`.  The length reported by the writer
    and the elapsed wall-clock time are printed to stdout.

    :param json_fname: path of the local log file to read
    :param hdfs_path: location handed to `PyHDFS.next_filename` to pick
                      the destination file name
    '''
    # Imported locally so this function does not depend on the module
    # having imported `datetime` elsewhere (e.g. only under a `__main__`
    # guard); otherwise importing callers hit a NameError here.
    import datetime

    start_time = datetime.datetime.now()
    try:
        fs = PyHDFS()
        fname = fs.next_filename(hdfs_path)
        with closing(open(json_fname, 'r')) as file_in:
            with closing(fs.open(fname, 'w')) as writer:
                for line in file_in:
                    writer.append(line.strip())
                # Report the length before the writer is closed.
                # Single-argument print form: same output under the
                # Python 2 print statement, and valid Python 3 too.
                print("Complete filesize:  %s" % writer.getLength())
    finally:
        # Runs on failure as well, so the duration is always reported
        # (the delta used to be computed and then thrown away).
        delta = datetime.datetime.now() - start_time
        print("Elapsed: %s" % delta)
from __future__ import with_statement
# NOTE(review): a __future__ import must precede every other statement in
# the module -- confirm this line really sits at the very top of the file.

from contextlib import closing
# Module-level import: functions in this module call datetime.datetime.now(),
# so the name must exist even when the module is imported rather than run.
import datetime

from hdfs import PyHDFS


if __name__ == '__main__':
    # Manual round-trip check: copy a sample log into a fresh HDFS file,
    # then read every record back.
    fs = PyHDFS()
    fname = fs.next_filename('/tmp/var/lock')
    print("Using: %s" % fname)
    print("Start: %s" % datetime.datetime.now())

    with closing(open('sample.json.log', 'r')) as file_in:
        with closing(fs.open(fname, 'w')) as writer:
            for line in file_in:
                line = line.strip()
                writer.append(line)
    print("Finished: %s" % datetime.datetime.now())

    # Iterate over the whole file and discard the records; this only
    # exercises the read path.
    with closing(fs.open(fname, 'r')) as reader:
        for syncSeen, key, json_blob in reader:
            pass
def export_file(hdfs_fname):
    '''
    Print the JSON payload of every record stored in `hdfs_fname`.

    The file is opened for reading through `PyHDFS`; each item the reader
    yields is unpacked as a 3-tuple and only its last element is written
    to stdout, one per line.

    :param hdfs_fname: name of the HDFS file to read
    '''
    hdfs = PyHDFS()
    with closing(hdfs.open(hdfs_fname, 'r')) as records:
        for record in records:
            # The first two fields (sync marker flag and key) are not used.
            _sync_seen, _key, json_blob = record
            print(json_blob)