Example #1
def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document
       consumption
    4. Process each change individually.
    5. On an unhandled exception, store the latest checkpoint to a local
       file and exit.
    """

    #add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print options

    # configurations
    last_seq = options.last_seq

    #get credential
    perm_file = '%s/.clou' % os.environ['HOME']
    creds = get_creds(perm_file)

    #connect to source database
    s = Server('https://%s:%s@%s' %
               (creds['cloudant_user'], creds['cloudant_pwd'], options.uri))
    db = s[options.dbname]
    #print db.info()

    #connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host,
                           port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    #and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db,
                                 include_docs=True,
                                 heartbeat=True,
                                 since=last_seq)
    for c in changestream:
        #print c
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
                counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)

    checkpoint(last_seq)
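
The checkpoint helper used throughout main() is not shown in this example. A minimal sketch, assuming it simply persists the last processed sequence id to a local file so a restart can resume from it (the file name is illustrative):

def checkpoint(seq):
    # Persist the latest `_changes` sequence id to a local file; the file
    # name here is an assumption for illustration, not from the source.
    with open('cloudant_changes.checkpoint', 'w') as f:
        f.write(str(seq))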
Example #2
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
                     '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
                     '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
                     '/usr/hdp/current/pig-client/piggybank.jar',
                     '/usr/hdp/current/spark-client/lib/spark-examples.jar']

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path) as file_data:
            try:
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                # rewind the local file before retrying: the failed attempt may
                # already have consumed part of the file handle
                file_data.seek(0)
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #3
File: hdfs.py Project: bkanuka/pymc
class HDFS(NDArray):
	'''
	HDFS storage

	Parameters
	----------
	name : str
		Name of directory to store text files (Path to the directory) without
		a leading '/'
	model : Model
		If None, the model is taken from the 'with' context
	vars : list of variables
		Sampling values will be stored for these variables. If None,
		'model.unobserved_RVs' is used
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None
	'''
	def __init__(self, name, model=None, vars=None, host='localhost', port='50070', user_name=None):
		self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
		try:
			self.hdfs.list_dir(name)
		except FileNotFound:
			self.hdfs.make_dir(name)
		super(HDFS, self).__init__(name, model, vars)

	def close(self):
		super(HDFS, self).close()
		_dump_trace(self.name, self)
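
A possible usage sketch for this backend, assuming a pymc3-style model context in which sampling accepts a backend via the `trace` argument; the directory name and connection details below are illustrative:

import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0, sd=1)
    # store sampled values as text files under user/hdfs/traces on HDFS
    backend = HDFS('user/hdfs/traces', host='namenode', port='50070', user_name='hdfs')
    trace = pm.sample(500, trace=backend)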
Example #4
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = [
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
        '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar'
    ]

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node,
                                  port=webhdfs_port,
                                  user_name=webhdfs_user,
                                  timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path,
                     platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #5
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar',
                     '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar']

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path, platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #6
class WhenTestingMkdirOperation(unittest.TestCase):
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.response = MagicMock()

    def test_mkdir_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.make_dir(self.path)

    def test_mkdir_returns_true(self):

        self.response.status_code = http_client.OK
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.make_dir(self.path)
        self.assertTrue(result)
Example #7
class WhenTestingMkdirOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.response = MagicMock()

    def test_mkdir_throws_exception_for_not_ok(self):

        self.response.status_code = httplib.BAD_REQUEST
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.make_dir(self.path)

    def test_mkdir_returns_true(self):

        self.response.status_code = httplib.OK
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.make_dir(self.path)
        self.assertTrue(result)
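
A deletion test could follow the same patching pattern, reusing the imports above; this sketch assumes delete_file_dir issues an HTTP DELETE and returns a truthy result for a 200 response:

class WhenTestingDeleteOperation(unittest.TestCase):

    def setUp(self):

        self.webhdfs = PyWebHdfsClient(host='hostname', port='00000',
                                       user_name='username')
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'

    def test_delete_returns_truthy_for_ok(self):

        self.response.status_code = httplib.OK
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.delete_file_dir(self.path)
        self.assertTrue(result)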
Example #8
    def test_webhdfs_csv(self):
        from pywebhdfs.webhdfs import PyWebHdfsClient
        dfs = PyWebHdfsClient(host='localhost',port='9870', user_name='hadoop')
        dfs.make_dir("/temp")

        with open("tests/data/data.csv") as input_file:
            dfs.create_file("/temp/data.csv", file_data=input_file, overwrite=True)

        dfs.delete_file_dir("/temp", recursive=True)
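
If a verification step were wanted between the create and delete calls, the same client can read the file back; a small sketch under the same connection assumptions as the test above:

from pywebhdfs.webhdfs import PyWebHdfsClient

dfs = PyWebHdfsClient(host='localhost', port='9870', user_name='hadoop')
data = dfs.read_file("/temp/data.csv")  # raw contents of the uploaded file
print(len(data))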
Example #9
def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document
       consumption
    4. Process each change individually.
    5. On an unhandled exception, store the latest checkpoint to a local
       file and exit.
    """

    # add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print options

    # configurations
    last_seq = options.last_seq

    # get credential
    perm_file = "%s/.clou" % os.environ["HOME"]
    creds = get_creds(perm_file)

    # connect to source database
    s = Server("https://%s:%s@%s" % (creds["cloudant_user"], creds["cloudant_pwd"], options.uri))
    db = s[options.dbname]
    # print db.info()

    # connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host, port=options.hdfs_port, user_name=creds["hdfs_user"])
    hdfs.make_dir(options.hdfs_path)

    # and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db, include_docs=True, heartbeat=True, since=last_seq)
    for c in changestream:
        # print c
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
                counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)

    checkpoint(last_seq)
Example #10
def sharedlib_install(name_node, webhdfs_port, authentic_user, platform_dir,
                      lib_path_list):
    # Set up a connection with HDFS using the namenode.
    hdfs = PyWebHdfsClient(host=name_node,
                           port=webhdfs_port,
                           user_name=authentic_user,
                           timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print >> sys.stdout, 'Copying source file: %s to HDFS path %s' %\
                             (path, platform_file)
        with open(path) as file_data:
            hdfs.create_file(platform_file, file_data, overwrite=True)
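
A hedged example of how this helper might be invoked; the host, user, and jar path below are placeholders rather than values taken from the source:

sharedlib_install(name_node='namenode.example.com',
                  webhdfs_port='14000',
                  authentic_user='hdfs',
                  platform_dir='user/deployment/platform',
                  lib_path_list=['/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar'])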
Example #11
class Store (store.Store):
    """
    HDFS backed store.
    """

    def __init__ (self):
        """ Connect to store """
        self._client = PyWebHdfsClient(host=store_host, port=store_port, user_name=store_user)

    def mkdir (self, path):
        self._client.make_dir(path)

    def read (self, path, open_handle):
        return StoreFile(self._client, path, "r", open_handle)

    def append (self, path, open_handle):
        return StoreFile(self._client, path, "a", open_handle)

    def write (self, path, open_handle):
        return StoreFile(self._client, path, "w", open_handle)

    def exists (self, path):
        try:
            dirinfo = self._client.list_dir(path)
            return True
        except errors.FileNotFound:
            return False
    
    def walk (self, path, visitor, recursive = False):
        """ Walk files in a path. Use recursive=True to include subdirs """
        dirinfo = self._client.list_dir(path)
        for status in dirinfo["FileStatuses"]["FileStatus"]:
            if recursive and status["type"] == "DIRECTORY":
                if len(path) > 0:
                    self.walk(path + "/" + status["pathSuffix"], visitor, recursive)
                else:
                    self.walk(status["pathSuffix"], visitor, recursive)
            else:
                info = dict(name=status["pathSuffix"], 
                            modify=datetime.fromtimestamp(status["modificationTime"]), 
                            size=status["length"])
                visitor(path, info)
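
A possible visitor for walk(), printing one line per entry; store_host, store_port, and store_user are assumed to be configured elsewhere, as in the class above:

def print_entry(path, info):
    # info carries name, modify (a datetime), and size, as built in walk()
    print('%s/%s  %s  %d bytes' % (path, info['name'], info['modify'], info['size']))

store = Store()
store.walk('user/data', print_entry, recursive=True)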
Example #12
class HDFS(NDArray):
    '''
	HDFS storage

	Parameters
	----------
	name : str
		Name of directory to store text files (Path to the directory) without
		a leading '/'
	model : Model
		If None, the model is taken from the 'with' context
	vars : list of variables
		Sampling values will be stored for these variables. If None,
		'model.unobserved_RVs' is used
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None
	'''
    def __init__(self,
                 name,
                 model=None,
                 vars=None,
                 host='localhost',
                 port='50070',
                 user_name=None):
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        try:
            self.hdfs.list_dir(name)
        except FileNotFound:
            self.hdfs.make_dir(name)
        super(HDFS, self).__init__(name, model, vars)

    def close(self):
        super(HDFS, self).close()
        _dump_trace(self.name, self)
Example #13
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = [
        '/usr/hdp/current/hbase-client/lib/hbase-client.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
        '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
        '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
        '/usr/hdp/current/pig-client/piggybank.jar',
        '/usr/hdp/current/spark-client/lib/spark-examples.jar'
    ]

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node,
                                  port=webhdfs_port,
                                  user_name=webhdfs_user,
                                  timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path) as file_data:
            try:
                hdfs_client.create_file(platform_file,
                                        file_data,
                                        overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                # rewind the local file before retrying: the failed attempt may
                # already have consumed part of the file handle
                file_data.seek(0)
                hdfs_client.create_file(platform_file,
                                        file_data,
                                        overwrite=True)
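Example #14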
#1 imports

from pywebhdfs.webhdfs import PyWebHdfsClient

#2 make connection with hadoop file system

hdfs = PyWebHdfsClient(user_name="hdfs",port=50070,host="sandbox.hortonworks.com")


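#3 remove the previous version of the csv file from hdfs, if any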
hdfs.delete_file_dir('chapter5/LoanStats3d.csv',recursive=True)

#4 recreate the chapters directory

hdfs.make_dir('chapter5')

#5 upload the csv file

with open('./data/stored_csv.csv') as file_data:
	hdfs.create_file('chapter5/LoanStats3d.csv',file_data, overwrite=True)

#6 print the status to see if this succeeded.
print hdfs.get_file_dir_status('chapter5/LoanStats3d.csv')
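
As an optional follow-up, the uploaded csv can be read back through the same client to spot-check its contents; a brief sketch:

#7 optionally read the file back to verify the upload
print(hdfs.read_file('chapter5/LoanStats3d.csv')[:100])
Example #15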
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host,
                                     port=port,
                                     user_name=user,
                                     timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self,
                       local_path,
                       remote_path,
                       exclude=None,
                       permission=755):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize('%s/%s/%s' %
                                     (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path,
                                           data,
                                           overwrite=True,
                                           permission=permission)
                    data.close()

    def make_dir(self, path, permission=755):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):

        logging.debug('create_file: %s', remote_file_path)

        sio = BytesIO(data)

        self._hdfs.create_file(canonicalize(remote_file_path),
                               sio,
                               overwrite=True,
                               permission=permission)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset,
                                        length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset,
                                            length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):

        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except:
            return False
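
A brief usage sketch for this wrapper; the connection details and paths are placeholders, and canonicalize is assumed to be provided by the surrounding module:

client = HDFS(host='namenode.example.com', port='14000', user='hdfs')
client.make_dir('user/example/app')
client.recursive_copy('./build/app', 'user/example/app', exclude=['.git'])
client.stream_file_to_disk('user/example/app/output.bin', '/tmp/output.bin')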
Example #16
import requests

source = requests.get("https://resources.lendingclub.com/LoanStats3d.csv.zip",
                      verify=False)
stringio = StringIO.StringIO(source.content)
unzipped = zipfile.ZipFile(stringio)
import pandas as pd
from pywebhdfs.webhdfs import PyWebHdfsClient

subselection_csv = pd.read_csv(unzipped.open('LoanStats3d.csv'),
                               skiprows=1,
                               skipfooter=2,
                               engine='python')
stored_csv = subselection_csv.to_csv('./stored_csv.csv')
hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox")
hdfs.make_dir('chapter5')
with open('./stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)
print(hdfs.get_file_dir_status('chapter5/LoanStats3d.csv'))

from pyspark.sql import HiveContext

# sc = SparkContext()
sqlContext = HiveContext(sc)
data = sc.textFile("/chapter5/LoanStats3d.csv")
parts = data.map(lambda r: r.split(','))
firstline = parts.first()
datalines = parts.filter(lambda x: x != firstline)


def cleans(row):
Example #17
class MulticlassLogisticRegressionModelTrainer(object):
    pass

    def __init__(self):
        pass

    def hdfsizePath(self, path):
        return self.hdfsServerUrl+path

    def start(self,q,parentEnv, configJsonStr):
        pass
        self.config = DictionaryAsNestedObjectSerializer(json.loads(configJsonStr))
        self.hdfs = PyWebHdfsClient(host=self.config.acm.servers.hdfs.host,port=self.config.acm.servers.hdfs.restPort, user_name=self.config.acm.servers.hdfs.fileOwner)
        self.hdfsServerUrl = "hdfs://"+self.config.acm.servers.hdfs.host+":"+str(self.config.acm.servers.hdfs.port)

        env_ = json.loads(parentEnv)
        py4jExists=False
        for key in env_.keys():
            os.environ[key]=env_[key]
            if "py4j-" in env_[key]:
                py4jExists=True

        ### set pyspark env variable ###
        #os.environ["SPARK_HOME"]="/home/halil/programs/spark230"
        #if os.environ.get("PYTHONPATH") is None:
        #    os.environ["PYTHONPATH"] = os.path.join(os.environ["SPARK_HOME"], "python/")
        #    
        #if py4jExists==False:
        #    os.environ["PYTHONPATH"] = os.path.join(os.environ["SPARK_HOME"], "python/lib/py4j-0.10.6-src.zip")+ ":"+ os.environ["PYTHONPATH"] 


        #set config
        trainDataFiles= self.hdfsizePath(self.config.acm.models.classification.data.hdfs.inputDir+"/*.csv")
        print (trainDataFiles)
        


        sc =SparkContext()
        sqlContext = SQLContext(sc)


        data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(trainDataFiles).limit(1000)
        print(data.columns)

        drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']

        data = data.select([column for column in data.columns if column not in drop_list])
        data.show(5)
        data.printSchema()

        # by top 20 categories
        data.groupBy("Category") \
            .count() \
            .orderBy(col("count").desc()) \
            .show()

        # by top 20 descriptions
        data.groupBy("Descript") \
            .count() \
            .orderBy(col("count").desc()) \
            .show()


        # regular expression tokenizer
        regexTokenizer = RegexTokenizer(inputCol="Descript", outputCol="words", pattern="\\W")

        # stop words
        add_stopwords = ["http","https","amp","rt","t","c","the"] # standard stop words

        stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

        # bag of words count
        countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)


        label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")

        transformers=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]

        pipeline = Pipeline(stages=transformers)

        pipelineFit = pipeline.fit(data)
        dataset = pipelineFit.transform(data)

        dataset.show(5)



        ### Randomly split data into training and test sets. set seed for reproducibility
        (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
        print("Training Dataset Count: " + str(trainingData.count()))
        print("Test Dataset Count: " + str(testData.count()))

        # Build the model
        lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
        
        # Train model with Training Data
        lrModel = lr.fit(trainingData)
        savedModelsDir=self.hdfsizePath(self.config.acm.models.classification.data.hdfs.savedModels)
        savedModelsZipDir=self.hdfsizePath(self.config.acm.models.classification.data.hdfs.zipDir)
        modelSavePolicy=self.config.acm.models.classification.modelSavePolicy

        if modelSavePolicy=="mostRecentOne":
            time_ms = str(int(time.time()*1000))
            #if not os.path.exists(outputDir):
            #    os.mkdir(outputDir)
            #if not os.path.exists(zipDir):
            #    os.mkdir(zipDir)
            self.hdfs.make_dir(self.config.acm.models.classification.data.hdfs.savedModels)
            self.hdfs.make_dir(self.config.acm.models.classification.data.hdfs.zipDir)
            newModelDirName = self.config.acm.models.classification.data.hdfs.savedModels + "/" + time_ms
            modelOutputPath = newModelDirName+"/model"
            pipelineOutputPath = newModelDirName+"/pipeline"
            self.hdfs.make_dir(newModelDirName)
            self.hdfs.make_dir(modelOutputPath)
            self.hdfs.make_dir(pipelineOutputPath)
            lrModel.write().overwrite().save(self.hdfsizePath( modelOutputPath))
            pipelineFit.write().overwrite().save(self.hdfsizePath(pipelineOutputPath))
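Example #18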
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize(
                            '%s/%s/%s' %
                            (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True)
                    data.close()

    def make_dir(self, path):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):

        logging.debug('create_file: %s', remote_file_path)

        sio = StringIO.StringIO(data)

        self._hdfs.create_file(
            canonicalize(remote_file_path),
            sio,
            overwrite=True)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)


    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10*1024*1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)
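Example #19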
def hdfsPutTrainDataToDir(filePath, dir_):
    fname = os.path.basename(filePath)
    hdfsPut(filePath, dir_ + "/" + fname)


conf = SparkConf()
conf.setMaster("spark://acm-spark-master:7077")
conf.setAppName("batch-multiclass-text-classification")

sc = SparkContext()
sqlContext = SQLContext(sc)

trainDataFile = "./data/sanfrancisco-crime/train.csv"
hdfsPath = "/acm/ml/clsf/data/test001"
modelsPath = hdfsPath + "/models"
hdfs.make_dir(hdfsPath)
hdfs.make_dir(modelsPath)
hdfsPutTrainDataToDir(trainDataFile, hdfsPath)

data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true',
    inferschema='true').load("hdfs://namenode:9000/" + hdfsPath).limit(1000)
print(data.columns)

drop_list = [
    'Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y'
]

data = data.select(
    [column for column in data.columns if column not in drop_list])
data.show(5)
Example #20
_LOG = logging.getLogger(__name__)


example_dir = 'user/hdfs/example_dir'
example_file = '{dir}/example.txt'.format(dir=example_dir)
example_data = '01010101010101010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'


# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070',
                       user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# get the checksum for the file
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)
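
The rename_dir variable defined above is left unused in this snippet; renaming the example directory with the same client would look roughly like this (rename_file_dir appears to expect an absolute destination path, as in the upstream usage example):

# move the example directory to the rename target
print('renaming directory {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))
print(hdfs.get_file_dir_status(rename_dir))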
Example #21
#!/usr/bin/env python
from pywebhdfs.webhdfs import PyWebHdfsClient
import os

hdfs = PyWebHdfsClient(host='namenode',port='50070', user_name='root')


def hdfsPut(local_path, hdfs_path):
    with open(local_path) as file_data:
        hdfs.create_file(hdfs_path, file_data=file_data, overwrite=True)


def hdfsPutTrainDataToDir(filePath, dir_):
    fname= os.path.basename(filePath)
    hdfsPath = dir_+"/"+fname
    hdfsPut(filePath, hdfsPath)


trainDataFile="./data/sanfrancisco-crime/train.csv"
hdfsPath="/acm/ml/clsf/data/test001"
hdfs.make_dir(hdfsPath)
hdfsPutTrainDataToDir(trainDataFile,hdfsPath)
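
To confirm the upload, the directory listing can be checked afterwards; a small sketch reusing the hdfs client and paths defined above:

listing = hdfs.list_dir(hdfsPath)
print(listing['FileStatuses']['FileStatus'])
print(hdfs.get_file_dir_status(hdfsPath + "/train.csv"))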
Example #22
    def upload_to_hdfs(self, local_file, table, index):
        '''
            upload file from local filesystem to hdfs
        '''
        hiveOper = hive_op.HiveOperation()
        local_dir = self._conf.get('local', 'data_dir')
        local_path = '{}{}/{}'.format(local_dir, index, local_file)
        host1 = self._conf.get('hdfs', 'name_node1')
        host2 = self._conf.get('hdfs', 'name_node2')
        user = self._conf.get('hdfs', 'user')
        port = self._conf.getint('hdfs', 'port')
        hdfs_base_path = self._conf.get('hdfs', 'upload_path')
        hdfs_dir_path = '{}{}'.format(hdfs_base_path, index)
        hdfs_path = '{}{}/{}'.format(hdfs_base_path, index, local_file)

        #implement HA manually
        try:
            hdfs_cli = PyWebHdfsClient(host=host1, port=port, user_name=user)
            hdfs_cli.list_dir('/')
        except Exception as e:
            logger.warn('open hdfs client failed error {}'.format(e))
            hdfs_cli = PyWebHdfsClient(host=host2, port=port, user_name=user)
            hdfs_cli.list_dir('/')

        if hdfs_cli is None:
            logger.error('no active host')
            return None

        try:
            hdfs_cli.get_file_dir_status(hdfs_path)

            # If the temporary file already exists on HDFS, the previous load
            # into Hive probably failed or the process was killed midway.
            # Load the data from the temporary file into Hive first, then
            # continue with the next step.
            ret = hiveOper.load_hdfs_file_into_tmp_table(hdfs_path, table)
            if ret == -1:
                logger.error('load from hdfs to tmp table failed')

            logger.info('last time! {} load into tmp finished'.format(table))
            hiveOper.load_tmp_table_to_main(table)
            logger.info(
                'last time! {} load tmp table to main finished'.format(table))

        #FileNotFountException
        except Exception as e:
            # A missing file here is the normal case
            logger.debug('no such file {}'.format(hdfs_path))

        retry_count = 0
        upload_finished = False
        while retry_count <= 10 and not upload_finished:
            with open(local_path) as f:
                logger.debug('''local path is {}, hdfs_cli is {}, 
                    file is {}, hdfs_path is {}'''.format(
                    local_path, hdfs_cli, f, hdfs_path))
                #hdfs_cli.delete_file_dir(hdfs_path)

                # If the target directory does not exist, create it first
                try:
                    hdfs_cli.get_file_dir_status(hdfs_dir_path)
                except Exception as e:
                    hdfs_cli.make_dir(hdfs_dir_path)

                try:
                    hdfs_cli.create_file(hdfs_path, f)
                    upload_finished = True
                except Exception as e:
                    logger.warn('''create file on hdfs failed, 
                        local path is {}, hdfs path is {}, 
                        retry count {}, upload flag {}'''.format(
                        local_path, hdfs_path, retry_count, upload_finished))
                    logger.warn('error is {}'.format(e))
                    retry_count += 1

        if retry_count <= 10:
            return hdfs_path
        else:
            logger.error('''{} upload 10 times, still failed, 
                retry count {}, upload_flag is {}'''.format(
                local_path, retry_count, upload_finished))
            return None
Example #23
class HadoopFileSystem(BaseFs.FileSystem):
    def __init__(self,
                 vcPath,
                 simulateOnly=False,
                 isVerbose=False,
                 logger=None,
                 user=None,
                 host=None,
                 port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost,
                                    port=hdfsPort,
                                    user_name=hdfsUser)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(path, e))
        return fd

    def exists_file_dir(self, fd):
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(
                fd.abspath, recursive))
        else:
            try:
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?", default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath,
                                                       recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))

    def list_dir(self, fd):
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self,
                                       fd.abspath,
                                       isSrc=True,
                                       needsDstDirCheck=False,
                                       fileJson=item)

    def make_dir(self, path):
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}"
                    .format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}"
                    .format(path, e))

    def open_file(self, fd, rwMode):
        return fd

    def close_file(self, fd):
        pass

    def touch_file(self, fd):
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                self.hdfs.create_file(fd.abspath, 0, overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def truncate_file(self, fd, size):
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(
                fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # Workaround for unordered concat bug in Hadoop 2.7.1 is to use one source at the time
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        chunkedList = [
            chunkFdList[pos:pos + concatStep]
            for pos in range(0, len(chunkFdList), concatStep)
        ]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException as e:
                break

        return currIndex

    def concat_files(self, fd, chunkFdList):
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)

        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(
                fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath,
                                               offset=offset,
                                               length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}"
                    .format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".
                    format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}"
                    .format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            print(
                "Copy within HDFS is not supported due to lack of Hadoop support"
            )
            print(
                "Once symbolic links are enabled, this feature will be enabled"
            )
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True)

    def get_hdfs_file_dir_json(self, path):
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
Example #24
    def save(self):
        hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
        coordinator_path = "{0}/{1}/coordinator.xml".format(self.path, self.name)
        hdfs.make_dir(self.path)
        hdfs.create_file(coordinator_path, self.as_xml())
Example #25
import logging

logging.basicConfig(level=logging.DEBUG)
_LOG = logging.getLogger(__name__)

example_dir = 'user/hdfs/example_dir'
example_file = '{dir}/example.txt'.format(dir=example_dir)
example_data = '01010101010101010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'

# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)
Example #26
    def save(self, workflow_name="workflow.xml"):
        hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
        workflow_path = "{0}/{1}/{2}".format(self.path, self.name, workflow_name)
        hdfs.make_dir(self.path)
        hdfs.create_file(workflow_path, self.as_xml())