Code example #1
class HDFS_Consumer(BaseConsumer):
    # consume parsed photo messages, buffer them in a temp file, and dump them to HDFS in batches
    def __init__(self, group_name, topic_name, max_tmp_size=50, timeout=15, filename="config.txt"):
        BaseConsumer.__init__(self, group_name, topic_name, timeout=timeout, filename=filename)
        self.ftmp = tempfile.NamedTemporaryFile()
        # max_tmp_size comes in unit of MB
        self.max_tmp_size = max_tmp_size * 1000 * 1000
        self.webhdfs = WebHDFS("c0tl.com", 50070, "hdfs")

    # given a dict msg, flatten it to a tab-delimited string
    def flatten_msg(self, parsed_msg):
        msg = (
            parsed_msg["data"]["action"]
            + "\t"
            + "%s" % parsed_msg["data"]["user_id"]
            + "\t"
            + "%s" % parsed_msg["data"]["photo"]["pid"]
            + "\t"
            + "%.15f" % parsed_msg["data"]["photo"]["location"]["latitude"]
            + "\t"
            + "%.15f" % parsed_msg["data"]["photo"]["location"]["longitude"]
            + "\t"
            + parsed_msg["data"]["photo"]["URL"]
            + "\t"
            + parsed_msg["data"]["photo"]["title"]
            + "\t"
            + parsed_msg["data"]["photo"]["description"]
            + "\t"
            + parsed_msg["data"]["photo"]["tags"]
            + "\t"
            + "%s" % parsed_msg["data"]["photo"]["timeposted"]
            + "\n"
        )
        return msg

    def handle_msg(self, parsed_msg):
        msg = self.flatten_msg(parsed_msg)
        print msg
        self.ftmp.write(msg)
        # if the tmp file size exceeds certain limits, flush it to HDFS
        if self.ftmp.tell() > self.max_tmp_size:
            self.flush_to_hdfs()

    def flush_to_hdfs(self):
        print "Flushing.."
        self.logger.info("Flushing tmp file")
        self.ftmp.flush()
        self.logger.info("Copying to HDFS..")
        # use the current timestamp as the HDFS file name
        hdfs_name = datetime.fromtimestamp(time.time()).strftime("%Y%m%d_%H%M%S")
        self.webhdfs.copyFromLocal(self.ftmp.name, "/user/photo_dump/post/%s.dat" % hdfs_name)
        self.ftmp.close()
        # create new temp file
        self.ftmp = tempfile.NamedTemporaryFile()

    def __del__(self):
        # before it exits, write the last file to hdfs
        self.logger.info("Exit Cleaning..")
        self.flush_to_hdfs()
        self.ftmp.close()
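
The long concatenation in flatten_msg is fragile: every numeric field needs its own "%s"/"%.15f" formatting, and one missing "\t" corrupts the whole record. An equivalent join-based helper (a minimal sketch against the same message layout; the standalone function is illustrative and not part of the source project) keeps the field list in one place:

def flatten_msg(parsed_msg):
    # Flatten the nested message dict into one tab-delimited line.
    data = parsed_msg["data"]
    photo = data["photo"]
    fields = [
        data["action"],
        str(data["user_id"]),
        str(photo["pid"]),
        "%.15f" % photo["location"]["latitude"],
        "%.15f" % photo["location"]["longitude"],
        photo["URL"],
        photo["title"],
        photo["description"],
        photo["tags"],
        str(photo["timeposted"]),
    ]
    return "\t".join(fields) + "\n"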
Code example #2
    def __init__(self, service_url):

        self.service_url = service_url

        try:
            result = urlparse.urlparse(service_url)
            self.host = result.netloc
            self.path = result.path
        except:
            logger.error("Error parsing URL.")

        self.__state = State.New
        self.__webhdfs = WebHDFS(self.HDFS_SERVICE_HOST,
                                 self.HDFS_SERVICE_PORT, self.HDFS_USER_NAME)
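
urlparse.urlparse rarely raises, so the try/except above mostly hides misconfiguration: on a malformed URL, self.host and self.path are silently never set. A stricter variant (a minimal sketch; only the attribute names are taken from the source) validates the result and fails fast:

import urlparse  # urllib.parse in Python 3

def parse_service_url(service_url):
    # Return (host, path), or raise if the URL has no network location.
    result = urlparse.urlparse(service_url)
    if not result.netloc:
        raise ValueError("cannot parse service URL: %s" % service_url)
    return result.netloc, result.path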
Code example #3
File: like_consume_hdfs_dump.py Project: vnisor/cotl
 def __init__(self,
              group_name,
              topic_name,
              max_tmp_size=50,
              timeout=15,
              filename='config.txt'):
     BaseConsumer.__init__(self,
                           group_name,
                           topic_name,
                           timeout=timeout,
                           filename=filename)
     self.ftmp = tempfile.NamedTemporaryFile()
     # max_tmp_size comes in unit of MB
     self.max_tmp_size = max_tmp_size * 1000 * 1000
     self.webhdfs = WebHDFS("c0tl.com", 50070, "hdfs")
Code example #4
class HDFS_Consumer(BaseConsumer):
	# consume parsed photo messages, buffer them in a temp file, and dump them to HDFS in batches
	def __init__(self,group_name, topic_name, max_tmp_size=50,timeout=15, filename='config.txt'):
		BaseConsumer.__init__(self,group_name, topic_name,timeout=timeout, filename=filename)
		self.ftmp = tempfile.NamedTemporaryFile()
		# max_tmp_size comes in unit of MB
		self.max_tmp_size = max_tmp_size*1000*1000
		self.webhdfs = WebHDFS("c0tl.com", 50070, "hdfs")
	# given a dict msg, flatten it to tab delimited string
	def flatten_msg(self, parsed_msg):
		msg = parsed_msg['data']['action'] + '\t' + \
				"%s" % parsed_msg['data']['user_id'] + '\t' +\
				"%s" % parsed_msg['data']['photo']['pid'] + '\t' + \
				"%.15f" % parsed_msg['data']['photo']['location']['latitude'] + '\t' +\
				"%.15f" % parsed_msg['data']['photo']['location']['longitude'] + '\t' +\
				parsed_msg['data']['photo']['URL'] + '\t' +\
				parsed_msg['data']['photo']['title'] + '\t' +\
				parsed_msg['data']['photo']['description'] + '\t' +\
				parsed_msg['data']['photo']['tags'] + '\t' +\
				"%s" % parsed_msg['data']['photo']['timeposted'] + '\n'
		return msg

	def handle_msg(self, parsed_msg):
		msg = self.flatten_msg(parsed_msg)
		print msg
		self.ftmp.write(msg)
		# if the tmp file size exceeds certain limits, flush it to HDFS
		if self.ftmp.tell()>self.max_tmp_size:
			self.flush_to_hdfs()

	def flush_to_hdfs(self):
		print "Flushing.."
		self.logger.info("Flushing tmp file")
		self.ftmp.flush()
		self.logger.info("Copying to HDFS..")
		# use the current timestamp as the HDFS file name
		hdfs_name = datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H%M%S')
		self.webhdfs.copyFromLocal(self.ftmp.name, "/user/photo_dump/post/%s.dat"%hdfs_name)
		self.ftmp.close()
		# create new temp file
		self.ftmp = tempfile.NamedTemporaryFile()
	
	def __del__(self):
		# before it exits, write the last file to hdfs
		self.logger.info("Exit Cleaning..")
		self.flush_to_hdfs()
		self.ftmp.close()
Code example #5
 def __init__(self, service_url):     
     
     self.service_url = service_url
     
     try:
         result = urlparse.urlparse(service_url)
         self.host = result.netloc
         self.path = result.path        
     except:
         logger.error("Error parsing URL.")
         
     self.__state = State.New
     self.__webhdfs = WebHDFS(self.HDFS_SERVICE_HOST,
                              self.HDFS_SERVICE_PORT,
                              self.HDFS_USER_NAME)
Code example #6
class WebHDFSFileAdaptor(object):
    
    HDFS_USER_NAME = "luckow"
    HDFS_SERVICE_HOST = "192.168.2.108"
    HDFS_SERVICE_PORT = 50070
    
    def __init__(self, service_url):     
        
        self.service_url = service_url
        
        try:
            result = urlparse.urlparse(service_url)
            self.host = result.netloc
            self.path = result.path        
        except:
            logger.error("Error parsing URL.")
            
        self.__state = State.New
        self.__webhdfs = WebHDFS(self.HDFS_SERVICE_HOST,
                                 self.HDFS_SERVICE_PORT,
                                 self.HDFS_USER_NAME)
        
        
    def initialize_pilotstore(self):
        self.__webhdfs.mkdir(self.path)
        
        
    def get_pilotstore_size(self):
        return 0
    
    
    def delete_pilotstore(self):
        self.__webhdfs.rmdir(self.path)
        
    def get_state(self):
        return self.__state
            
            
    def create_pd(self, pd_id):
        pd_dir = self.__get_pd_path(pd_id)
        logger.debug("mkdir: " + pd_dir) 
        self.__webhdfs.mkdir(pd_dir)
        
        
    def put_pd(self, pd):
        for i in pd.list_data_units():     
            remote_path = os.path.join(self.__get_pd_path(pd.id), os.path.basename(i.local_url))
            logger.debug("Put file: %s to %s"%(i.local_url, remote_path))
                        
            if i.local_url.startswith("file://") or i.local_url.startswith("/"):
                if stat.S_ISDIR(os.stat(i.local_url).st_mode):
                    logger.warning("Path %s is a directory. Ignored."%i.local_url)                
                    continue            
                self.__webhdfs.copyFromLocal(i.local_url, remote_path)
            else:
                logger.error("File URLs: %s not supported"%i.local_url)
                             

    def copy_pd_to_url(self, pd, local_url, remote_url):
        
        if not remote_url.startswith("file://") and not remote_url.startswith("/"):
            logger.error("Only local URLs supported")
            return
        
        result = urlparse.urlparse(remote_url)
        path = result.path    
        # create directory
        try:
            os.makedirs(path)
        except:
            logger.debug("Directory: %s already exists."%path)
            
        base_dir = self.__get_pd_path(pd.id)
        for filename in self.__webhdfs.listdir(base_dir):
            file_url = local_url + "/" + filename
            file_remote_url = remote_url + "/" + filename
            logger.debug("GET " + file_url + " to " + file_remote_url)
            self.__webhdfs.copyToLocal(file_url, file_remote_url)



    def copy_pd(self, pd, ps_new):
        pass
    

    def get_pd(self, pd, target_url):
        remote_url = target_url
        local_url =  self.__get_pd_path(pd.id)
        self.copy_pd_to_url(pd, local_url, remote_url)         
    
        
    def remove_pd(self, pd):
        self.__webhdfs.rmdir(self.__get_pd_path(pd.id))
    
    
    ###########################################################################
    # Internal methods
    def __get_pd_path(self, pd_id):
        return os.path.join(self.path, str(pd_id))
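
For orientation, a hypothetical driver for the adaptor could look like the following; the webhdfs:// service URL and the pd identifier are assumptions, and State, WebHDFS, and logger must already be importable as in the source module:

adaptor = WebHDFSFileAdaptor("webhdfs://192.168.2.108:50070/pilot-store")
adaptor.initialize_pilotstore()    # mkdir /pilot-store on HDFS
adaptor.create_pd("pd-42")         # mkdir /pilot-store/pd-42
print adaptor.get_state()          # State.New
adaptor.remove_pd("pd-42")
adaptor.delete_pilotstore()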
Code example #7
File: test-webhdfs.py Project: jazzwang/hadoop_labs
from webhdfs.webhdfs import WebHDFS
import os, tempfile
import time
import getpass

webhdfs = WebHDFS("localhost", 50070, getpass.getuser())

webhdfs.mkdir("/hello-world")

# create a temporary file
f = tempfile.NamedTemporaryFile()
f.write(b'Hello world!\n')
f.flush()

print "Upload file: " + f.name

webhdfs.copyFromLocal(f.name, "/hello-world/test.txt")
webhdfs.copyToLocal("/hello-world/test.txt", "test1.txt")

f.close()
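
The test leaves both the HDFS directory and the downloaded test1.txt behind. Assuming rmdir behaves as it does in example #6, a cleanup step could be appended:

# clean up the artifacts created above (rmdir usage assumed from example #6)
os.remove("test1.txt")
webhdfs.rmdir("/hello-world")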
Code example #8
File: example.py Project: ezc/webhdfs-py
from webhdfs.webhdfs import WebHDFS
import os, tempfile
import time

webhdfs = WebHDFS("localhost", 50070, "luckow")

webhdfs.mkdir("/tmp/hello-world/")

# create a temporary file
f = tempfile.NamedTemporaryFile()
f.write(b'Hello world!\n')
f.flush() 

print "Upload file: " + f.name

webhdfs.copyFromLocal(f.name,
                      "/hello-world/test.txt")

webhdfs.copyToLocal("/hello-world/test.txt",
                    "/tmp/test1.txt")

for i in webhdfs.listdir("/hello-world/"):
    print str(i)

f.close()
Code example #9
 def __init__(self, group_name, topic_name, max_tmp_size=50, timeout=15, filename="config.txt"):
     BaseConsumer.__init__(self, group_name, topic_name, timeout=timeout, filename=filename)
     self.ftmp = tempfile.NamedTemporaryFile()
     # max_tmp_size comes in unit of MB
     self.max_tmp_size = max_tmp_size * 1000 * 1000
     self.webhdfs = WebHDFS("c0tl.com", 50070, "hdfs")
Code example #10
from webhdfs.webhdfs import WebHDFS
import os, tempfile
import time

DATA_PATH = "/N/u/luckow/DATA_BFAST/hg18chr21_10"

start = time.time()
webhdfs = WebHDFS("localhost", 50070, "luckow")

webhdfs.mkdir("/hg18chr21_10/")

for i in os.listdir(DATA_PATH):
    filename = os.path.join(DATA_PATH, i)
    print "Upload file: " + filename

    webhdfs.copyFromLocal(
        filename, os.path.join("/hg18chr21_10", os.path.basename(filename)))

elapsed_time = time.time() - start
print "Upload Time: " + str(elapsed_time) + " sec"
Code example #11
File: test-webhdfs.py Project: ikafire/hadoop_labs
from webhdfs.webhdfs import WebHDFS
import os, tempfile
import time
import getpass
 
webhdfs = WebHDFS("localhost", 50070, getpass.getuser())
 
webhdfs.mkdir("/hello-world")
 
# create a temporary file
f = tempfile.NamedTemporaryFile()
f.write(b'Hello world!\n')
f.flush()
 
print "Upload file: " + f.name
 
webhdfs.copyFromLocal(f.name, "/hello-world/test.txt")
webhdfs.copyToLocal("/hello-world/test.txt", "test1.txt")

f.close()
Code example #12
from webhdfs.webhdfs import WebHDFS
import os, tempfile
import time

DATA_PATH="/N/u/luckow/DATA_BFAST/hg18chr21_10"


start = time.time()
webhdfs = WebHDFS("localhost", 50070, "luckow")

webhdfs.mkdir("/hg18chr21_10/")

for i in os.listdir(DATA_PATH):
    filename = os.path.join(DATA_PATH, i)
    print "Upload file: " + filename

    webhdfs.copyFromLocal(filename,
                          os.path.join("/hg18chr21_10", os.path.basename(filename)))

elapsed_time = time.time() - start
print "Upload Time: " + str(elapsed_time) + " sec"
Code example #13
def main():
    with open(os.path.expanduser('~') + '/.whdfsc.json', 'r') as f:
        test_config = json.load(f)
    hdfs = WebHDFS(**test_config)

    print " > echo -n '1234567890' > test.txt"
    hdfs.create('test.txt', lsrc=__file__, overwrite=True)

    print " > echo -n 'abcdefg' >> test.txt"
    hdfs.append('test.txt', data='abcdefg\n')

    print " > ls test.txt"
    print hdfs.list_status('test.txt')

    print " > mkdir example"
    print hdfs.mkdirs('example')

    print " > ls example"
    print hdfs.list_status('example')

    print " > mv test.txt example/test.txt"
    print hdfs.rename('test.txt', 'example/test.txt')

    print " > ls example"
    print hdfs.list_status('example')

    print " > cat example/test.txt"
    print hdfs.open('example/test.txt')

    print " > rm -r example"
    print hdfs.delete('example', recursive=True)
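
The script only defines main(); the usual entry-point guard makes it runnable directly:

if __name__ == '__main__':
    main()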
Code example #14
class WebHDFSFileAdaptor(object):

    HDFS_USER_NAME = "luckow"
    HDFS_SERVICE_HOST = "192.168.2.108"
    HDFS_SERVICE_PORT = 50070

    def __init__(self, service_url):

        self.service_url = service_url

        try:
            result = urlparse.urlparse(service_url)
            self.host = result.netloc
            self.path = result.path
        except:
            logger.error("Error parsing URL.")

        self.__state = State.New
        self.__webhdfs = WebHDFS(self.HDFS_SERVICE_HOST,
                                 self.HDFS_SERVICE_PORT, self.HDFS_USER_NAME)

    def get_security_context(self):
        """ Returns security context that needs to be available on the distributed
            node in order to access this Pilot Data """
        return None

    def initialize_pilotstore(self):
        self.__webhdfs.mkdir(self.path)

    def get_pilotstore_size(self):
        return 0

    def delete_pilotstore(self):
        self.__webhdfs.rmdir(self.path)

    def get_state(self):
        return self.__state

    def create_pd(self, pd_id):
        pd_dir = self.__get_pd_path(pd_id)
        logger.debug("mkdir: " + pd_dir)
        self.__webhdfs.mkdir(pd_dir)

    def put_pd(self, pd):
        for i in pd.list_data_units():
            remote_path = os.path.join(self.__get_pd_path(pd.id),
                                       os.path.basename(i.local_url))
            logger.debug("Put file: %s to %s" % (i.local_url, remote_path))

            if i.local_url.startswith("file://") or i.local_url.startswith(
                    "/"):
                if stat.S_ISDIR(os.stat(i.local_url).st_mode):
                    logger.warning("Path %s is a directory. Ignored." %
                                   i.local_url)
                    continue
                self.__webhdfs.copyFromLocal(i.local_url, remote_path)
            else:
                logger.error("File URLs: %s not supported" % i.local_url)

    def copy_pd_to_url(self, pd, local_url, remote_url):

        if not remote_url.startswith("file://") and not remote_url.startswith(
                "/"):
            logger.error("Only local URLs supported")
            return

        result = urlparse.urlparse(remote_url)
        path = result.path
        # create directory
        try:
            os.makedirs(path)
        except:
            logger.debug("Directory: %s already exists." % path)

        base_dir = self.__get_pd_path(pd.id)
        for filename in self.__webhdfs.listdir(base_dir):
            file_url = local_url + "/" + filename
            file_remote_url = remote_url + "/" + filename
            logger.debug("GET " + file_url + " to " + file_remote_url)
            self.__webhdfs.copyToLocal(file_url, file_remote_url)

    def copy_pd(self, pd, ps_new):
        pass

    def get_pd(self, pd, target_url):
        remote_url = target_url
        local_url = self.__get_pd_path(pd.id)
        self.copy_pd_to_url(pd, local_url, remote_url)

    def remove_pd(self, pd):
        self.__webhdfs.rmdir(self.__get_pd_path(pd.id))

    ###########################################################################
    # Internal methods
    def __get_pd_path(self, pd_id):
        return os.path.join(self.path, str(pd_id))