import time
import tempfile
from datetime import datetime

from webhdfs.webhdfs import WebHDFS

# BaseConsumer is provided by the surrounding project (import path omitted here).


class HDFS_Consumer(BaseConsumer):
    # buffer parsed messages in a local temp file and flush them to HDFS in batches

    def __init__(self, group_name, topic_name, max_tmp_size=50,
                 timeout=15, filename="config.txt"):
        BaseConsumer.__init__(self, group_name, topic_name,
                              timeout=timeout, filename=filename)
        self.ftmp = tempfile.NamedTemporaryFile()
        # max_tmp_size comes in units of MB
        self.max_tmp_size = max_tmp_size * 1000 * 1000
        self.webhdfs = WebHDFS("c0tl.com", 50070, "hdfs")

    # given a dict msg, flatten it to a tab-delimited string
    def flatten_msg(self, parsed_msg):
        photo = parsed_msg["data"]["photo"]
        msg = (
            parsed_msg["data"]["action"] + "\t" +
            "%s" % parsed_msg["data"]["user_id"] + "\t" +
            "%s" % photo["pid"] + "\t" +
            "%.15f" % photo["location"]["latitude"] + "\t" +
            "%.15f" % photo["location"]["longitude"] + "\t" +
            photo["URL"] + "\t" +
            photo["title"] + "\t" +
            photo["description"] + "\t" +
            photo["tags"] + "\t" +
            "%s" % photo["timeposted"] + "\n"
        )
        return msg

    def handle_msg(self, parsed_msg):
        msg = self.flatten_msg(parsed_msg)
        print msg
        self.ftmp.write(msg)
        # if the tmp file size exceeds the limit, flush it to HDFS
        if self.ftmp.tell() > self.max_tmp_size:
            self.flush_to_hdfs()

    def flush_to_hdfs(self):
        print "Flushing.."
        self.logger.info("Flushing tmp file")
        self.ftmp.flush()
        self.logger.info("Copying to HDFS..")
        # use the current timestamp as the HDFS file name
        hdfs_name = datetime.fromtimestamp(time.time()).strftime("%Y%m%d_%H%M%S")
        self.webhdfs.copyFromLocal(self.ftmp.name,
                                   "/user/photo_dump/post/%s.dat" % hdfs_name)
        self.ftmp.close()
        # create a new temp file for the next batch
        self.ftmp = tempfile.NamedTemporaryFile()

    def __del__(self):
        # before exiting, write the last buffered file to HDFS
        self.logger.info("Exit Cleaning..")
        self.flush_to_hdfs()
        self.ftmp.close()
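# A minimal usage sketch for HDFS_Consumer. How messages are pulled from the
# topic is up to BaseConsumer, which is not shown here: consume() below is a
# hypothetical stand-in for the project's real consume loop, and the group,
# topic and size values are made up.
if __name__ == "__main__":
    consumer = HDFS_Consumer("photo-group", "photo-topic", max_tmp_size=100)
    # assume BaseConsumer yields already-parsed message dicts
    for parsed_msg in consumer.consume():
        consumer.handle_msg(parsed_msg)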
from webhdfs.webhdfs import WebHDFS

import getpass
import tempfile

webhdfs = WebHDFS("localhost", 50070, getpass.getuser())
webhdfs.mkdir("/hello-world")

# create a temporary file and upload it to HDFS
f = tempfile.NamedTemporaryFile()
f.write(b'Hello world!\n')
f.flush()

print "Upload file: " + f.name
webhdfs.copyFromLocal(f.name, "/hello-world/test.txt")
webhdfs.copyToLocal("/hello-world/test.txt", "test1.txt")
f.close()
from webhdfs.webhdfs import WebHDFS

import tempfile

webhdfs = WebHDFS("localhost", 50070, "luckow")
webhdfs.mkdir("/hello-world/")

# create a temporary file, upload it and copy it back from HDFS
f = tempfile.NamedTemporaryFile()
f.write(b'Hello world!\n')
f.flush()

print "Upload file: " + f.name
webhdfs.copyFromLocal(f.name, "/hello-world/test.txt")
webhdfs.copyToLocal("/hello-world/test.txt", "/tmp/test1.txt")

# list the directory contents
for i in webhdfs.listdir("/hello-world/"):
    print str(i)

f.close()
from webhdfs.webhdfs import WebHDFS

import os
import time

DATA_PATH = "/N/u/luckow/DATA_BFAST/hg18chr21_10"

start = time.time()
webhdfs = WebHDFS("localhost", 50070, "luckow")
webhdfs.mkdir("/hg18chr21_10/")

# upload every file in DATA_PATH and time the whole transfer
for i in os.listdir(DATA_PATH):
    filename = os.path.join(DATA_PATH, i)
    print "Upload file: " + filename
    webhdfs.copyFromLocal(filename,
                          os.path.join("/hg18chr21_10",
                                       os.path.basename(filename)))

elapsed_time = time.time() - start
print "Upload Time: " + str(elapsed_time) + " sec"
import json
import os

# Note: the WebHDFS class used here exposes a different, REST-style API
# (create/append/list_status/mkdirs/rename/open/delete) than the
# webhdfs.webhdfs module above; its import path depends on the package
# providing it.


def main():
    # connection parameters are read from a JSON config file in $HOME
    with open(os.path.expanduser('~') + '/.whdfsc.json', 'r') as f:
        test_config = json.load(f)
    hdfs = WebHDFS(**test_config)

    print " > cp %s test.txt" % __file__
    hdfs.create('test.txt', lsrc=__file__, overwrite=True)

    print " > echo 'abcdefg' >> test.txt"
    hdfs.append('test.txt', data='abcdefg\n')

    print " > ls test.txt"
    print hdfs.list_status('test.txt')

    print " > mkdir example"
    print hdfs.mkdirs('example')

    print " > ls example"
    print hdfs.list_status('example')

    print " > mv test.txt example/test.txt"
    print hdfs.rename('test.txt', 'example/test.txt')

    print " > ls example"
    print hdfs.list_status('example')

    print " > cat example/test.txt"
    print hdfs.open('example/test.txt')

    print " > rm -r example"
    print hdfs.delete('example', recursive=True)


if __name__ == '__main__':
    main()
import os
import stat
import logging
import urlparse

from webhdfs.webhdfs import WebHDFS

logger = logging.getLogger(__name__)

# State is the Pilot Data state enum provided by the surrounding framework.


class WebHDFSFileAdaptor(object):

    HDFS_USER_NAME = "luckow"
    HDFS_SERVICE_HOST = "192.168.2.108"
    HDFS_SERVICE_PORT = 50070

    def __init__(self, service_url):
        self.service_url = service_url
        try:
            result = urlparse.urlparse(service_url)
            self.host = result.netloc
            self.path = result.path
        except:
            logger.error("Error parsing URL.")
        self.__state = State.New
        self.__webhdfs = WebHDFS(self.HDFS_SERVICE_HOST,
                                 self.HDFS_SERVICE_PORT,
                                 self.HDFS_USER_NAME)

    def get_security_context(self):
        """Return the security context that needs to be available on the
        distributed node in order to access this Pilot Data."""
        return None

    def initialize_pilotstore(self):
        self.__webhdfs.mkdir(self.path)

    def get_pilotstore_size(self):
        return 0

    def delete_pilotstore(self):
        self.__webhdfs.rmdir(self.path)

    def get_state(self):
        return self.__state

    def create_pd(self, pd_id):
        pd_dir = self.__get_pd_path(pd_id)
        logger.debug("mkdir: " + pd_dir)
        self.__webhdfs.mkdir(pd_dir)

    def put_pd(self, pd):
        for i in pd.list_data_units():
            remote_path = os.path.join(self.__get_pd_path(pd.id),
                                       os.path.basename(i.local_url))
            logger.debug("Put file: %s to %s" % (i.local_url, remote_path))
            if i.local_url.startswith("file://") or i.local_url.startswith("/"):
                if stat.S_ISDIR(os.stat(i.local_url).st_mode):
                    logger.warning("Path %s is a directory. Ignored." % i.local_url)
                    continue
                self.__webhdfs.copyFromLocal(i.local_url, remote_path)
            else:
                logger.error("File URL %s not supported" % i.local_url)

    def copy_pd_to_url(self, pd, local_url, remote_url):
        if not remote_url.startswith("file://") and not remote_url.startswith("/"):
            logger.error("Only local URLs supported")
            return

        result = urlparse.urlparse(remote_url)
        path = result.path
        # create the target directory if it does not exist yet
        try:
            os.makedirs(path)
        except OSError:
            logger.debug("Directory %s already exists." % path)

        base_dir = self.__get_pd_path(pd.id)
        for filename in self.__webhdfs.listdir(base_dir):
            file_url = local_url + "/" + filename
            file_remote_url = remote_url + "/" + filename
            logger.debug("GET " + file_url + " to " + file_remote_url)
            self.__webhdfs.copyToLocal(file_url, file_remote_url)

    def copy_pd(self, pd, ps_new):
        pass

    def get_pd(self, pd, target_url):
        remote_url = target_url
        local_url = self.__get_pd_path(pd.id)
        self.copy_pd_to_url(pd, local_url, remote_url)

    def remove_pd(self, pd):
        self.__webhdfs.rmdir(self.__get_pd_path(pd.id))

    ###########################################################################
    # Internal methods

    def __get_pd_path(self, pd_id):
        return os.path.join(self.path, str(pd_id))
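# A minimal usage sketch for the adaptor. The Pilot Data objects belong to the
# surrounding framework and are not shown above, so the _DataUnit/_PilotData
# stubs, the service URL and the file paths below are illustrative
# assumptions, not the framework's real classes.
class _DataUnit(object):
    def __init__(self, local_url):
        self.local_url = local_url


class _PilotData(object):
    def __init__(self, pd_id, units):
        self.id = pd_id
        self._units = units

    def list_data_units(self):
        return self._units


adaptor = WebHDFSFileAdaptor("webhdfs://192.168.2.108:50070/pilot-store")
adaptor.initialize_pilotstore()

pd = _PilotData("pd-1", [_DataUnit("/tmp/input.dat")])  # assumes the file exists
adaptor.create_pd(pd.id)
adaptor.put_pd(pd)                     # upload all data units of pd
adaptor.get_pd(pd, "/tmp/restored")    # download them back into a local directory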