Example #1
def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document
       consumption
    4. Process each change individually.
    5. Upon exception throwing, store the latest checkpoint to local file and
       exit.
    """

    #add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print options

    # configurations
    last_seq = options.last_seq

    #get credential
    perm_file = '%s/.clou' % os.environ['HOME']
    creds = get_creds(perm_file)

    #connect to source database
    s = Server('https://%s:%s@%s' %
               (creds['cloudant_user'], creds['cloudant_pwd'], options.uri))
    db = s[options.dbname]
    #print db.info()

    #connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host,
                           port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    #and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db,
                                 include_docs=True,
                                 heartbeat=True,
                                 since=last_seq)
    for c in changestream:
        #print c
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
                counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)

    checkpoint(last_seq)
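
The helpers used above (configureOptions, get_creds, processChange, checkpoint) are defined elsewhere in the project. As a rough illustration of step 5 of the docstring, a minimal checkpoint() might simply persist the latest sequence number to a local file; the file name and format here are assumptions, not part of the original code:

def checkpoint(last_seq, checkpoint_file='.cloudant_last_seq'):
    # Persist the most recent `_changes` sequence number so the next run
    # can resume the feed with `since=last_seq`.
    with open(checkpoint_file, 'w') as f:
        f.write(str(last_seq))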
Example #2
class HdfsHandler:
    def __init__(self, hadoopHost, hadoopPort='50070', user='******'):
        # self.hdfs = PyWebHdfsClient(host='52.14.121.163', port='50070', user_name='hadoop')
        self.hdfs = PyWebHdfsClient(host=hadoopHost,
                                    port=hadoopPort,
                                    user_name=user)
        self.s3_client = boto3.client('s3')

    def copyToHDFS(self, src_path, hdfs_path):
        if hdfs_path.startswith("hdfs"):
            temp_path = hdfs_path.split("8020")
            self.new_hdfs_path = temp_path[1] + '/lib'
            print "New Path: %s" % self.new_hdfs_path
        # create a new client instance
        # print "New Path: %s" % self.new_hdfs_path[1]
        jar_name = os.path.basename(src_path)
        print src_path
        fileContent = open(src_path, 'rb').read()

        # copies file to local for testing purpose
        # with open("E:/temp/java-0.0.2.jar", "wb") as jarfile:
        #     jarfile.write(fileContent)

        # create a new file on hdfs
        print('making new file at: {0}\n'.format(jar_name))
        result = self.hdfs.create_file(self.new_hdfs_path + "/" + jar_name,
                                       fileContent,
                                       overwrite=True)
        print "HDFS Copy Result: %s" % result
        return result

    def list_hdfs_dir(self, hdfs_path):
        print self.hdfs.list_dir(hdfs_path)
Example #3
    def load(self, job, task, fifo):
        self.job = job
        self.task = task
        self.fifo = fifo
        self.key = None
        self.script_proc = None
        self.decompress_obj = None
        self.pycurl_callback_exception = None

        if task.data['scheme'] == 's3':
            self.is_anonymous = job.spec.source.aws_access_key is None or job.spec.source.aws_secret_key is None
            if self.is_anonymous:
                s3_conn = S3Connection(anon=True)
            else:
                s3_conn = S3Connection(job.spec.source.aws_access_key,
                                       job.spec.source.aws_secret_key)
            bucket = s3_conn.get_bucket(task.data['bucket'])

            try:
                self.key = bucket.get_key(task.data['key_name'])
            except S3ResponseError as e:
                raise WorkerException(
                    "Received %s %s accessing `%s`, aborting" %
                    (e.status, e.reason, task.data['key_name']))
        elif task.data['scheme'] == 'hdfs':
            fname = task.data['key_name']
            client = PyWebHdfsClient(job.spec.source.hdfs_host,
                                     job.spec.source.webhdfs_port,
                                     user_name=job.spec.source.hdfs_user)
            try:
                filesize = client.get_file_dir_status(
                    fname)['FileStatus']['length']
            except pywebhdfs.errors.FileNotFound:
                raise WorkerException("File '%s' does not exist on HDFS" %
                                      fname)
            self.key = AttrDict({'name': fname, 'size': filesize})
        elif task.data['scheme'] == 'file':
            globber = glob2.Globber()
            fname = globber._normalize_string(task.data['key_name'])

            if not os.path.exists(fname):
                raise WorkerException(
                    "File '%s' does not exist on this filesystem" % fname)
            elif not os.path.isfile(fname):
                raise WorkerException("File '%s' exists, but is not a file" %
                                      fname)

            self.key = AttrDict({
                'name': fname,
                'size': os.path.getsize(fname)
            })
        else:
            raise WorkerException('Unsupported job with paths: %s' %
                                  [str(p) for p in self.job.paths])

        if self.key is None:
            raise WorkerException(
                'Failed to find key associated with task ID %s' % task.task_id)

        self.metrics = DownloadMetrics(self.key.size)
Example #4
def upload_file():
    """
    Upload file
    ---
    tags:
        - Files
    consumes: "multipart/form-data"
    parameters:
        -   name: file
            in: formData
            required: true
            paramType: body
            dataType: file
            type: file
    responses:
        200:
            description: Return a successful message
        401:
            description: Unauthorized
        400:
            description: Bad Request
        500:
            description: Server Internal error
    """
    # hard-coded config information; you should improve it
    hdfs = PyWebHdfsClient(host='webhdfs', port='50070', user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success':'true'})

    return jsonify({'success':'false'})
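
For completeness, the endpoint above can be exercised with a multipart POST. A minimal client-side sketch using the requests library; the host, route, and file name are assumptions, while the form field name 'file' matches the handler above:

import requests

with open('report.csv', 'rb') as f:
    resp = requests.post('http://localhost:5000/files',  # hypothetical route
                         files={'file': ('report.csv', f)})
print(resp.json())  # expected: {'success': 'true'} or {'success': 'false'}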
Example #5
    def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
            overwrite=False):
        # extract some information from the function
        if udf_name is None:
            udf_name = function.name
        symbol = function.llvm_func.name
        ir = function.llvm_module.to_bitcode()
        return_type = udf_to_impala_type[function.signature.return_type.name]
        arg_types = [udf_to_impala_type[arg.name]
                        for arg in function.signature.args[1:]]

        # ship the IR to the cluster
        hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                user_name=ic._hdfs_user)
        if hdfs_path is None:
            hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
        if not hdfs_path.endswith('.ll'):
            raise ValueError("The HDFS file name must end with .ll")
        hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

        # register the function in Impala
        if database is None:
            database = ic._temp_db
        impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
        if overwrite:
            ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
        register_query = "CREATE FUNCTION %s RETURNS %s LOCATION '%s' SYMBOL='%s'" % (impala_name,
                return_type, hdfs_path, symbol)
        ic._cursor.execute(register_query)
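
To make the registration step concrete, here is a small, self-contained sketch of the SQL the template above produces; the database, function name, argument types, HDFS path, and symbol are all hypothetical values chosen only for illustration:

database, udf_name = 'tmp_udfs', 'fused_add'        # hypothetical
arg_types, return_type = ['INT', 'INT'], 'BIGINT'   # hypothetical
impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
register_query = ("CREATE FUNCTION %s RETURNS %s LOCATION '%s' SYMBOL='%s'"
                  % (impala_name, return_type,
                     '/user/impala/udfs/fused_add.ll', 'fused_add_impl'))
print(register_query)
# -> CREATE FUNCTION tmp_udfs.fused_add(INT, INT) RETURNS BIGINT LOCATION '/user/impala/udfs/fused_add.ll' SYMBOL='fused_add_impl'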
Example #6
class WhenTestingDeleteOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()

    def test_rename_throws_exception_for_not_ok(self):

        self.response.status_code = httplib.BAD_REQUEST
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_file_dir(self.path)

    def test_rename_returns_true(self):

        self.response.status_code = httplib.OK
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.delete_file_dir(self.path)
        self.assertTrue(result)
Example #7
    def __init__(self, host, port, user, logger):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)

        global LOGGER
        LOGGER = logger
        LOGGER.debug('webhdfs = %s@%s:%s', user, host, port)
Example #8
class WhenTestingDeleteXattrOperation(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.xattr = 'user.test'
        self.response = MagicMock()

    @patch.object(Session, 'put')
    def test_delete_xattr_throws_exception_for_not_ok(self, mock_put):
        self.response.status_code = http_client.BAD_REQUEST
        mock_put.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.delete_xattr(self.path, self.xattr)

    @patch.object(Session, 'put')
    def test_delete_xattr_returns_true(self, mock_put):
        self.response.status_code = http_client.OK
        mock_put.return_value = self.response
        result = self.webhdfs.delete_xattr(self.path, self.xattr)

        self.assertTrue(result)
Example #9
class WhenTestingOpenOperation(unittest.TestCase):
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.file_data = u'010101'
        self.response = MagicMock()
        self.response.content = self.file_data

    def test_read_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.read_file(self.path)

    def test_read_returns_file(self):

        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)
Example #10
class WhenTestingGetXattrOperation(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.xattr = 'user.test'
        self.response = MagicMock()
        self.file_status = {
            "XAttrs": [
                {
                    "name": self.xattr,
                    "value": "1"
                }
            ]
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_get_xattr_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.get_xattr(self.path, self.xattr)

    @patch.object(Session, 'get')
    def test_get_xattr_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.get_xattr(self.path, self.xattr)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])
Example #11
class WhenTestingListXattrsOperation(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "XAttrNames":
                [
                    "[\"XATTRNAME1\",\"XATTRNAME2\",\"XATTRNAME3\"]"
                ]
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_list_xattrs_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.list_xattrs(self.path)

    @patch.object(Session, 'get')
    def test_list_xattrs_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.list_xattrs(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])
Example #12
class WhenTestingDeleteOperation(unittest.TestCase):
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()

    def test_rename_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_file_dir(self.path)

    def test_rename_returns_true(self):

        self.response.status_code = http_client.OK
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.delete_file_dir(self.path)
        self.assertTrue(result)
Example #13
class WhenTestingGetFileChecksumOperation(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_checksum = {
            "FileChecksum": {
                "algorithm": "MD5-of-1MD5-of-512CRC32",
                "bytes": ("000002000000000000000000729a144ad5e9399f70c9bedd757"
                          "2e6bf00000000"),
                "length": 28
            }
        }
        self.response.json = MagicMock(return_value=self.file_checksum)

    @patch.object(Session, 'get')
    def test_get_status_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.get_file_checksum(self.path)

    @patch.object(Session, 'get')
    def test_get_status_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.get_file_checksum(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_checksum[key])
Example #14
class WhenTestingRenameOperation(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.path = 'user/hdfs/old_dir'
        self.new_path = '/user/hdfs/new_dir'
        self.response = MagicMock()
        self.rename = {"boolean": True}
        self.response.json = MagicMock(return_value=self.rename)

    @patch.object(Session, 'put')
    def test_rename_throws_exception_for_not_ok(self, mock_put):
        self.response.status_code = http_client.BAD_REQUEST
        mock_put.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.rename_file_dir(self.path, self.new_path)

    @patch.object(Session, 'put')
    def test_rename_returns_true(self, mock_put):
        self.response.status_code = http_client.OK
        mock_put.return_value = self.response
        result = self.webhdfs.rename_file_dir(self.path, self.new_path)
        self.assertEqual(result, {"boolean": True})
Example #15
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatus": {
                "accessTime": 0,
                "blockSize": 0,
                "group": "supergroup",
                "length": 0,
                "modificationTime": 1320173277227,
                "owner": "webuser",
                "pathSuffix": "",
                "permission": "777",
                "replication": 0,
                "type": "DIRECTORY"
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)
Example #16
class WhenTestingOpenOperation(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs'
        self.file_data = u'010101'
        self.response = MagicMock()
        self.response.content = self.file_data

    @patch.object(Session, 'get')
    def test_read_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.read_file(self.path)

    @patch.object(Session, 'get')
    def test_read_returns_file(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)

    @patch.object(Session, 'get')
    def test_stream_returns_generator(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.stream_file(self.path)
        self.assertIsInstance(result, types.GeneratorType)
Example #17
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.path = 'user/hdfs'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
Example #18
def to_hdfs(file_path, hdfs_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1',
                           port='50070',
                           user_name='hdfs',
                           timeout=100)
    with open(file_path, 'rb') as f:
        hdfs.create_file(hdfs_path, f, overwrite=True)
Example #19
class WhenTestingRenameOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.new_path = '/user/hdfs/new_dir'
        self.response = MagicMock()
        self.rename = {"boolean": True}
        self.response.json = MagicMock(return_value=self.rename)

    def test_rename_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.rename_file_dir(self.path, self.new_path)

    def test_rename_returns_true(self):

        self.response.status_code = http_client.OK
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.rename_file_dir(self.path, self.new_path)
        self.assertEqual(result, {"boolean": True})
Example #20
def solarLog_call(epoch_time):
    conn = http.client.HTTPConnection("")
    r = requests.get(" http://winsun.solarlog-web.ch/api?cid=" + pfadheimBaarCID + "&locale=de_ch&username=277555406&password=5a03cdf0a3ff42de09bc85361d8a2f0f&function=dashboard&format=jsonh&solarlog=9112&tiles=Yield|true,Grafic|true,Env|true,Weather|true&ctime=" + epoch_time)
    logging.info("Response: " + str(r.status_code) + " " + r.reason)

    data = r.json()  # This will return entire content.
    data['timestamp'] = epoch_time
    # Remove key's with complex JSON structure
    del data['cur_production_per_wrid']
    del data['invEnergyType']
    #del data['decimalseperator']
    logging.debug(data)

    #write data to .json
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # write the same data as .csv since it is easier to handle with HDFS
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.csv', 'w') as f:  # Just use 'w' mode in 3.x
        w = csv.DictWriter(f, data.keys(), dialect=csv.excel_tab)
        w.writeheader()
        w.writerow(data)

    # write the same data as .csv to HDFS since it is easier to handle there
    hdfs = PyWebHdfsClient(host='193.246.208.147', port='50079', user_name='hdfs')
    #hdfs_path = 'user/hdfs/from_python'
    hdfs.create_file('user/hdfs/pfadibaar_solarlog.csv', '0100')
    #with open('pfadibaar_solarlog_' + epoch_time + '.csv') as file_data:
    #    hdfs.create_file(hdfs_path, data=file_data)

    conn.close()
Example #21
class WhenTestingCreateUri(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.path = 'user/hdfs'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)

    def test_create_uri_no_kwargs(self):
        op = operations.CREATE
        uri = 'http://{host}:{port}/webhdfs/v1/' \
              '{path}?op={op}&user.name={user}'\
            .format(
                host=self.host, port=self.port, path=self.path,
                op=op, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op)
        self.assertEqual(uri, result)

    def test_create_uri_with_kwargs(self):
        op = operations.CREATE
        mykey = 'mykey'
        myval = 'myval'
        uri = 'http://{host}:{port}/webhdfs/v1/' \
              '{path}?op={op}&{key}={val}' \
              '&user.name={user}' \
            .format(
                host=self.host, port=self.port, path=self.path,
                op=op, key=mykey, val=myval, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op, mykey=myval)
        self.assertEqual(uri, result)
Example #22
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar',
                     '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar']

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Creates a new file on HDFS and write contents from local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path, platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #23
class WhenTestingDeleteXattrOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.xattr = 'user.test'
        self.response = MagicMock()

    def test_delete_xattr_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.return_value = self.response
        with patch('requests.sessions.Session.put', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_xattr(self.path, self.xattr)

    def test_delete_xattr_returns_true(self):

        self.response.status_code = http_client.OK
        self.requests.return_value = self.response
        with patch('requests.sessions.Session.put', self.requests):
            result = self.webhdfs.delete_xattr(self.path, self.xattr)

        self.assertTrue(result)
Example #24
class WhenTestingCreateUri(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.path = 'user/hdfs'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)

    def test_create_uri_no_kwargs(self):
        op = operations.CREATE
        uri = 'http://{host}:{port}/webhdfs/v1/' \
              '{path}?op={op}&user.name={user}'\
            .format(
                host=self.host, port=self.port, path=self.path,
                op=op, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op)
        self.assertEqual(uri, result)

    def test_create_uri_with_kwargs(self):
        op = operations.CREATE
        mykey = 'mykey'
        myval = 'myval'
        uri = 'http://{host}:{port}/webhdfs/v1/' \
              '{path}?op={op}&{key}={val}' \
              '&user.name={user}' \
            .format(
                host=self.host, port=self.port, path=self.path,
                op=op, key=mykey, val=myval, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op, mykey=myval)
        self.assertEqual(uri, result)
Example #25
class WhenTestingOpenOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.response = MagicMock()
        self.response.text = self.file_data

    def test_read_throws_exception_for_not_ok(self):

        self.response.status_code = httplib.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.read_file(self.path)

    def test_read_returns_file(self):

        self.response.status_code = httplib.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)
Example #26
class WhenTestingListDirOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatuses": {
                "FileStatus": [
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 24930,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "a.patch",
                        "permission": "777",
                        "replication": 0,
                        "type": "FILE"
                    },
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 0,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "",
                        "permission": "777",
                        "replication": 0,
                        "type": "DIRECTORY"
                    }
                ]
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_get_status_throws_exception_for_not_ok(self, mock_get):

        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.list_dir(self.path)

    @patch.object(Session, 'get')
    def test_get_status_returns_true(self, mock_get):

        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.list_dir(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])
Example #27
def downParts(fpath):
    'Download the part-0??? series of files produced by Spark from HDFS, concatenating them into a single local file while downloading'
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='iasp76', port='12003', user_name='mci')
    flist = hdfs.list_dir(fpath)
    x = flist['FileStatuses']['FileStatus']
    _SUCCESS = False
    for f in x:
        if f['pathSuffix'] == '_SUCCESS':
            _SUCCESS = True
            break
    if not _SUCCESS:
        print("not complete yet!")
        return
    fnames = [
        f['pathSuffix'] for f in x if f['pathSuffix'].startswith('part-')
    ]
    fnames1 = sorted(fnames)
    foutname = fpath[fpath.rfind('/') + 1:]
    l = len(fnames1)
    with open(foutname, "wb") as fo:
        for fname in fnames1:
            fpath1 = fpath + "/" + fname
            fo.write(hdfs.read_file(fpath1))
            print(" progress: ", fname, l)
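
A hedged usage sketch: with a hypothetical Spark output directory, the call below would concatenate its part-0000, part-0001, ... files into a single local file named after the last path component, provided the _SUCCESS marker exists:

downParts('user/mci/results/wordcount')  # hypothetical path; writes ./wordcount locally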
Example #28
    def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
                 overwrite=False):
        # extract some information from the function
        if udf_name is None:
            udf_name = function.name
        symbol = function.llvm_func.name
        ir = function.llvm_module.to_bitcode()
        return_type = udf_to_impala_type[function.signature.return_type.name]
        arg_types = [udf_to_impala_type[arg.name]
                     for arg in function.signature.args[1:]]

        # ship the IR to the cluster
        hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                      user_name=ic._hdfs_user)
        if hdfs_path is None:
            hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
        if not hdfs_path.endswith('.ll'):
            raise ValueError("The HDFS file name must end with .ll")
        hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

        # register the function in Impala
        if database is None:
            database = ic._temp_db
        impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
        if overwrite:
            ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
        register_query = ("CREATE FUNCTION %s RETURNS %s "
                          "LOCATION '%s' SYMBOL='%s'") % (impala_name,
                                                          return_type,
                                                          hdfs_path, symbol)
        ic._cursor.execute(register_query)
Example #29
File: hdfs.py Project: bkanuka/pymc
class HDFS(NDArray):
	'''
	HDFS storage

	Parameters
	----------
	name : str
		Name of directory to store text files (Path to the directory) without
		a leading '/'
	model : Model
		If None, the model is taken from the 'with' context
	vars : list of variables
		Sampling values will be stored for these variables. If None,
		'model.unobserved_RVs' is used
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None
	'''
	def __init__(self, name, model=None, vars=None, host='localhost', port='50070', user_name=None):
		self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
		try:
			self.hdfs.list_dir(name)
		except FileNotFound:
			self.hdfs.make_dir(name)
		super(HDFS, self).__init__(name, model, vars)

	def close(self):
		super(HDFS, self).close()
		_dump_trace(self.name, self)
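
A minimal usage sketch, assuming a PyMC3-style sampling API in which a backend instance is passed to pm.sample via the trace argument; the model, directory name, and namenode address below are hypothetical:

import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0, sd=1)
    backend = HDFS('traces/mu_run', host='namenode.example.com',
                   port='50070', user_name='hdfs')
    trace = pm.sample(1000, trace=backend)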
Example #30
    def submit(self, bund, files=[]):
        hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')

        for f in files:
            hdfs.create_file("{}/{}".format(bund.path, f.name), f.read())  

        doc, tag, text = Doc().tagtext()
        with tag("configuration"):
            with tag("property"):
                with tag("name"):
                    text("user.name")
                with tag("value"):
                    text("oozie")

            with tag("property"):
                with tag("name"):
                    text("oozie.bundle.application.path")
                with tag("value"):
                    text("/"+bund.path + "/" + bund.name)

        configuration = doc.getvalue()
        response = post("{0}/oozie/v1/jobs".format(self.url), data=configuration, headers={'Content-Type': 'application/xml'})

        if response.status_code > 399:
            print response.headers["oozie-error-message"]
        print response.status_code
        print response.content
Example #31
def saveToStore(path,meta):
    con=happybase.Connection(MasterHbase)
    con.open()
    metaTable= con.table('MetaTable')
    if meta['size'] < largeSize:
        # save to Hbase
        encTable = con.table('EncTable')
        with open(path,'rb') as f:
            encTable.put(meta['rowkey'],{'enc:data': f.read()})
        metaTable.put(str(meta['rowkey']),{
                'pp:name': str(meta['filename']),
                'pp:checksum': str(meta['checksum']),
                'pp:size': str(meta['size']),
                'pp:often': str(meta['often']),
                'pp:des': str(meta['description'])
                }
              )
        app.logger.debug('%s is saved to Hbase',meta['rowkey'])
    else:
        # save to HDFS
        hdfs = PyWebHdfsClient(host=Master,port='50070', timeout=None,user_name='hduser')
        with open(path, 'rb') as f:
            hdfs.create_file(HDFSMainPath+meta['rowkey'], f)
        metaTable.put(str(meta['rowkey']),{
                'pp:name': str(meta['filename']),
                'pp:checksum': str(meta['checksum']),
                'pp:size': str(meta['size']),
                'pp:HDFSpath': str(HDFSMainPath + meta['rowkey']),
                'pp:often': str(meta['often']),
                'pp:des': str(meta['description'])
                }
              )
        app.logger.debug('%s is saved to HDFS',meta['rowkey'])
    con.close()
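
A hedged sketch of a call to saveToStore; the meta keys mirror exactly what the function reads above, while every value (and the module-level MasterHbase, largeSize, HDFSMainPath, and app objects it depends on) is hypothetical:

meta = {
    'rowkey': 'user1-2016-01-01-report',
    'filename': 'report.bin',
    'checksum': '9e107d9d372bb6826bd81d3542a419d6',
    'size': 52428800,  # bytes, compared against largeSize
    'often': 'rarely',
    'description': 'encrypted backup blob',
}
saveToStore('/tmp/report.bin.enc', meta)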
Example #32
def save_extracted_subgraph(elements, args: application_args):
    pair, subgraph, _ = elements
    path = args.get_folder_results_path()
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, f"graph_{str(pair[0])}_{str(pair[1])}")
    pickled = pkl.dumps(subgraph)
    hdfs.create_file(file, pickled, overwrite=True)
Example #33
 def close(self):
     # drop the temp database
     self._cursor.execute('USE %s' % self._temp_db)
     self._cursor.execute('SHOW TABLES')
     temp_tables = [x[0] for x in self._cursor.fetchall()]
     for table in temp_tables:
         self._cursor.execute('DROP TABLE IF EXISTS %s.%s' % (self._temp_db, table))
     self._cursor.execute('SHOW FUNCTIONS')
     temp_udfs = [x[1] for x in self._cursor.fetchall()]
     for udf in temp_udfs:
         self._cursor.execute('DROP FUNCTION IF EXISTS %s.%s' % (self._temp_db, udf))
     self._cursor.execute('SHOW AGGREGATE FUNCTIONS')
     temp_udas = [x[1] for x in self._cursor.fetchall()]
     for uda in temp_udas:
         self._cursor.execute('DROP AGGREGATE FUNCTION IF EXISTS %s.%s' % (self._temp_db, uda))
     self._cursor.execute('USE default')
     self._cursor.execute('DROP DATABASE IF EXISTS %s' % self._temp_db)
     # drop the temp dir in HDFS
     try:
         from requests.exceptions import ConnectionError
         from pywebhdfs.webhdfs import PyWebHdfsClient
         hdfs_client = PyWebHdfsClient(host=self._nn_host,
             port=self._webhdfs_port, user_name=self._hdfs_user)
         hdfs_client.delete_file_dir(self._temp_dir.lstrip('/'), recursive=True)
     except ImportError:
         import sys
         sys.stderr.write("Could not import requests or pywebhdfs. "
             "You must delete the temporary directory manually: %s" % self._temp_dir)
     except ConnectionError:
         import sys
         sys.stderr.write("Could not connect via pywebhdfs. "
             "You must delete the temporary directory manually: %s" % self._temp_dir)
Example #34
 def test_create_throws_exception_for_not_created(self, mock_put):
     webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                               user_name=self.user_name)
     self.init_response.status_code = http_client.TEMPORARY_REDIRECT
     self.response.status_code = http_client.BAD_REQUEST
     mock_put.side_effect = [self.init_response, self.response]
     with self.assertRaises(errors.PyWebHdfsException):
         webhdfs.create_file(self.path, self.file_data)
Example #35
 def get_file_contents(self,
                       hdfs_path,
                       user_name='trifacta',
                       httpfs_port='14000'):
     hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                            port=httpfs_port,
                            user_name=user_name)
     return hdfs.read_file(hdfs_path).decode('utf-8')
Example #36
def from_hdfs(hdfs_path, file_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1',
                           port='50070',
                           user_name='hdfs',
                           timeout=100)
    binary_file = hdfs.read_file(hdfs_path)
    with open(file_path, 'wb') as f:
        f.write(binary_file)
Example #37
    def run(self):
        if "agg" in self.arg:
            # reading from a file into memory to stream later
            with open(self.path, "rb") as f:
                self.data_holder['data'] = json.dumps(cPickle.load(f))
            # indicating that reading into memory is finished for this data
            self.data_holder["indicator"] = 'ready'

        elif "raw" in self.arg:
            from pywebhdfs.webhdfs import PyWebHdfsClient
            hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070', user_name='uacharya')

            file_path = 'user/uacharya/flow/' + str(self.arg['d']) + '/node_' + str(self.arg['n']) + '/output.csv'
            # reading the csv file into memory
            self.data_holder['data'] = hdfs.read_file(file_path, buffersize=4096)

            self.data_holder["indicator"] = 'ready'

        elif "bitmap" in self.arg:
            # putting the line data into an object to stream
            with open(self.path + "/data.json", "rb") as f:
                self.data_holder['data'] = json.dumps(cPickle.load(f))
            # not loading images into memory if there are no images
            if self.data_holder['data'] == '""':
                # indicating that reading into memory is finished for this data
                self.data_holder['frames'] = (0, [])
                self.data_holder["indicator"] = 'ready'
                return
            # there is some data to stream, so read all the PNGs into a list
            content_length = 0  # total content length in bytes of all images to stream
            PNGS = []  # list to hold all the PNG data in memory
            # reading all the images into memory to stream
            for x in xrange(1, 31):
                buf_string = cStringIO.StringIO()
                Image.open(self.path + "/imgs/" + str(x) + ".png").save(buf_string, format="PNG", quality=100)
                content_length = content_length + (buf_string.tell() + 4)
                PNGS.append(struct.pack('>I', buf_string.tell()) + buf_string.getvalue())
                buf_string.close()

            self.data_holder['frames'] = (content_length, PNGS)
            # indicating that reading into memory is finished for this data
            self.data_holder["indicator"] = 'ready'

        else:
            raise InvalidFormatError("the type of format is not available to read in memory")
Example #38
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
     self.response = MagicMock()
     self.requests = MagicMock(return_value=self.response)
     self.path = 'user/hdfs/old_dir'
Example #39
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
     self.path = 'user/hdfs/old_dir'
     self.xattr = 'user.test'
     self.response = MagicMock()
Example #40
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
     self.path = 'user/hdfs'
     self.file_data = u'010101'
     self.response = MagicMock()
     self.response.content = self.file_data
Example #41
 def test_create_returns_file_location(self, mock_put):
     webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                               user_name=self.user_name)
     self.init_response.status_code = http_client.TEMPORARY_REDIRECT
     self.response.status_code = http_client.CREATED
     mock_put.side_effect = [self.init_response, self.response]
     result = webhdfs.create_file(self.path, self.file_data)
     self.assertTrue(result)
     mock_put.assert_called_with(
         self.location, headers=self.expected_headers, data=self.file_data)
Example #42
def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document
       consumption
    4. Process each change individually.
    5. Upon exception throwing, store the latest checkpoint to local file and
       exit.
    """

    # add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print options

    # configurations
    last_seq = options.last_seq

    # get credential
    perm_file = "%s/.clou" % os.environ["HOME"]
    creds = get_creds(perm_file)

    # connect to source database
    s = Server("https://%s:%s@%s" % (creds["cloudant_user"], creds["cloudant_pwd"], options.uri))
    db = s[options.dbname]
    # print db.info()

    # connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host, port=options.hdfs_port, user_name=creds["hdfs_user"])
    hdfs.make_dir(options.hdfs_path)

    # and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db, include_docs=True, heartbeat=True, since=last_seq)
    for c in changestream:
        # print c
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
                counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)

    checkpoint(last_seq)
Example #43
    def __init__(self, remote=None, namenodes=None, **kwargs):
        self.remote = remote
        self.namenodes = namenodes or []

        PyWebHdfsClient.__init__(self, **kwargs)

        if self.namenodes and 'path_to_hosts' not in kwargs:
            self.path_to_hosts = [('.*', self.namenodes)]

        # Override base uri
        self.base_uri_pattern = kwargs.get('base_uri_pattern', "http://{host}/webhdfs/v1/").format(
            host="{host}")
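
A hedged construction sketch for this subclass; the class name HaWebHdfsClient is made up here (the excerpt does not show it), and the namenode host:port strings are hypothetical. With namenodes given and no explicit path_to_hosts, the __init__ above maps every path ('.*') to the listed hosts:

client = HaWebHdfsClient(
    namenodes=['nn1.example.com:50070', 'nn2.example.com:50070'],
    user_name='hdfs')  # remaining kwargs are forwarded to PyWebHdfsClient
print(client.list_dir('user/hdfs'))  # any path now resolves against the namenode list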
Example #44
class WhenTestingCreateUri(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.path = 'user/hdfs'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)

    def test_create_uri_no_kwargs(self):
        op = operations.CREATE
        uri = 'http://{{host}}:{port}/webhdfs/v1/' \
              '{path}?op={op}&user.name={user}'\
              .format(port=self.port, path=self.path,
                      op=op, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op)
        self.assertEqual(uri, result)

    def test_create_uri_with_kwargs(self):
        op = operations.CREATE
        mykey = 'mykey'
        myval = 'myval'
        uri = 'http://{{host}}:{port}/webhdfs/v1/' \
              '{path}?op={op}&{key}={val}' \
              '&user.name={user}' \
              .format(
                  port=self.port, path=self.path,
                  op=op, key=mykey, val=myval, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op,
                                          mykey=myval)
        self.assertEqual(uri, result)

    def test_create_uri_with_leading_slash(self):
        op = operations.CREATE
        uri_path_no_slash = self.webhdfs._create_uri(self.path, op)
        uri_path_with_slash = self.webhdfs._create_uri('/' + self.path, op)
        self.assertEqual(uri_path_no_slash, uri_path_with_slash)

    def test_create_uri_with_unicode_path(self):
        op = operations.CREATE
        mykey = 'mykey'
        myval = 'myval'
        path = u'die/Stra\xdfe'
        quoted_path = 'die/Stra%C3%9Fe'
        uri = 'http://{{host}}:{port}/webhdfs/v1/' \
              '{path}?op={op}&{key}={val}' \
              '&user.name={user}' \
              .format(
                  port=self.port, path=quoted_path,
                  op=op, key=mykey, val=myval, user=self.user_name)
        result = self.webhdfs._create_uri(path, op, mykey=myval)
        self.assertEqual(uri, result)
Example #45
class WhenTestingFileExistsOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatus": {
                "accessTime": 0,
                "blockSize": 0,
                "group": "supergroup",
                "length": 0,
                "modificationTime": 1320173277227,
                "owner": "webuser",
                "pathSuffix": "",
                "permission": "777",
                "replication": 0,
                "type": "DIRECTORY"
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_exists_throws_exception_for_error(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.exists_file_dir(self.path)

    def test_exists_returns_true(self):

        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            self.assertTrue(self.webhdfs.exists_file_dir(self.path))

    def test_exists_returns_false(self):

        self.response.status_code = http_client.NOT_FOUND
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            self.assertFalse(self.webhdfs.exists_file_dir(self.path))
Example #46
class WhenTestingCreateOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()
        self.expected_headers = {'content-type': 'application/octet-stream'}

    def test_create_throws_exception_for_no_redirect(self):

        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.CREATED
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_throws_exception_for_not_created(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_returns_file_location(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.CREATED
        self.put_method = MagicMock(
            side_effect=[self.init_response, self.response])
        self.requests.put = self.put_method
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.create_file(self.path, self.file_data)
        self.assertTrue(result)
        self.put_method.assert_called_with(
            self.location, headers=self.expected_headers, data=self.file_data)
Example #47
def create_data_from_station_data(first, second):
    """this function creates the data analyzing the two stations in comparison"""
    global hdfs  # global hdfs object
    global hbase  # global hbase object

    if hdfs is None:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070', user_name='uacharya')

    if hbase is None:
        import happybase
        hbase = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu')

    date_for_comparison = first["Date"].strip()

    # create a directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/simulation/' + date_for_comparison)
    except Exception:
        # directories hold the dataset as a csv file for each node in the wall display, numbered 1 to 9
        for index in range(1, 10):
            content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'
            try:
                hdfs.create_file('user/uacharya/simulation/' + date_for_comparison + '/node' + str(index) + '/output.csv', content, replication=1)
            except Exception:
                continue

    dataset = {'node_1': [], 'node_2': [], 'node_3': [], 'node_4': [], 'node_5': [], 'node_6': [], 'node_7': [], 'node_8': [], 'node_9': []}

    for data in broadcast_variable.value:
        compare_data_between(date_for_comparison, first, data, dataset)

#    for key in dataset:
#        if(len(dataset[key])!=0):
#            content = "\n".join(dataset[key]);
#            content +="\n";
#            while(True):
#                try:
#                    hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096);
#                    break;
#                except Exception:
#                    time.sleep(0.2);
#                    continue;

    
    dataset.clear()  # clearing the dictionary
    # append over here after all the global variables have been set up
    return second
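The commented-out block above sketches the intended write path: join each node's rows, append them to that node's per-date CSV, and retry when HDFS temporarily refuses the append. A standalone version of that idea, with hypothetical names and assuming `hdfs` is the same PyWebHdfsClient, might look like this:

import time

def append_rows_with_retry(hdfs, csv_path, rows, pause=0.2):
    # keep retrying until HDFS accepts the append (e.g. while another writer holds the lease)
    content = "\n".join(rows) + "\n"
    while True:
        try:
            hdfs.append_file(csv_path, content, buffersize=4096)
            return
        except Exception:
            time.sleep(pause)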
Example #48
0
    def load(self, job, task, fifo):
        self.job = job
        self.task = task
        self.fifo = fifo
        self.key = None
        self.script_proc = None
        self.decompress_obj = None
        self.pycurl_callback_exception = None

        if task.data['scheme'] == 's3':
            self.is_anonymous = job.spec.source.aws_access_key is None or job.spec.source.aws_secret_key is None
            if self.is_anonymous:
                s3_conn = S3Connection(anon=True)
            else:
                s3_conn = S3Connection(job.spec.source.aws_access_key, job.spec.source.aws_secret_key)
            bucket = s3_conn.get_bucket(task.data['bucket'])

            try:
                self.key = bucket.get_key(task.data['key_name'])
            except S3ResponseError as e:
                raise WorkerException("Received %s %s accessing `%s`, aborting" % (e.status, e.reason, task.data['key_name']))
        elif task.data['scheme'] == 'hdfs':
            fname = task.data['key_name']
            client = PyWebHdfsClient(
                job.spec.source.hdfs_host,
                job.spec.source.webhdfs_port,
                user_name=job.spec.source.hdfs_user)
            try:
                filesize = client.get_file_dir_status(fname)['FileStatus']['length']
            except pywebhdfs.errors.FileNotFound:
                raise WorkerException("File '%s' does not exist on HDFS" % fname)
            self.key = AttrDict({'name': fname, 'size': filesize})
        elif task.data['scheme'] == 'file':
            globber = glob2.Globber()
            fname = globber._normalize_string(task.data['key_name'])

            if not os.path.exists(fname):
                raise WorkerException("File '%s' does not exist on this filesystem" % fname)
            elif not os.path.isfile(fname):
                raise WorkerException("File '%s' exists, but is not a file" % fname)

            self.key = AttrDict({'name': fname, 'size': os.path.getsize(fname)})
        else:
            raise WorkerException('Unsupported job with paths: %s' % [ str(p) for p in self.job.paths ])

        if self.key is None:
            raise WorkerException('Failed to find key associated with task ID %s' % task.task_id)

        self.metrics = DownloadMetrics(self.key.size)
Example #49
0
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatus": {
                "accessTime": 0,
                "blockSize": 0,
                "group": "supergroup",
                "length": 0,
                "modificationTime": 1320173277227,
                "owner": "webuser",
                "pathSuffix": "",
                "permission": "777",
                "replication": 0,
                "type": "DIRECTORY"
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)
Example #50
0
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.path = 'user/hdfs'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
Example #51
0
File: hdfs.py Project: bkanuka/pymc
	def __init__(self, name, model=None, vars=None, host='localhost', port='50070', user_name=None):
		self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
		try:
			self.hdfs.list_dir(name)
		except FileNotFound:
			self.hdfs.make_dir(name)
		super(HDFS, self).__init__(name, model, vars)
Example #52
0
def update_raw_stage(output, delivery_tag):

    #context = zmq.Context()

    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)

    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()

    start_time = time.time()

    for k, v in output.iteritems():

        if (time.time() - start_time)/60 > sink_minutes:
            sink_logger.warning('ETL process running longer than the sink timeout: {0} minutes'.format((time.time() - start_time)/60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format((time.time() - start_time)/60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
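A minimal invocation sketch for the stage loader above, assuming the webhdfs_*, impala_* and sink_* settings are configured at module level as the code implies; the keys and rows are made-up test data:

# made-up rows; each key k selects the raw_log_{k}.txt file and the test_log_{k} Impala table
output = {
    1: ['2017-01-01\tuser_a\tlogin', '2017-01-01\tuser_b\tlogout'],
    2: ['2017-01-01\tuser_c\tclick'],
}
update_raw_stage(output, delivery_tag='delivery-tag-42')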
Example #53
0
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
                     '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
                     '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
                     '/usr/hdp/current/pig-client/piggybank.jar',
                     '/usr/hdp/current/spark-client/lib/spark-examples.jar']

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Creates a new file on HDFS and write contents from local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path) as file_data:
            try:
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                file_data.seek(0)  # rewind the handle so the retry uploads the full file
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #54
0
File: hdfs.py Project: bkanuka/pymc
def load(name, chains=None, model=None, host='localhost', port='50070', user_name=None):
	'''
	Load text database

	Parameters
	----------
	name : str
		Path to root directory in HDFS for text database without a leading '/'
	chains : list
		Chains to load. If None, all chains are loaded
	model : Model
		If None, the model is taken from the 'with' context
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None

	Returns
	-------
	ndarray.Trace instance
	'''
	hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
	chain_dirs = _get_chain_dirs(name, hdfs)
	if chains is None:
		chains = list(chain_dirs.keys())
	traces = []
	for chain in chains:
		chain_dir = chain_dirs[chain]
		dir_path = os.path.join(name, chain_dir)
		shape_file = os.path.join(dir_path, 'shapes.json')
		shapes = json.load(StringIO.StringIO(hdfs.read_file(shape_file)))
		samples = {}
		for varname, shape in shapes.items():
			var_file = os.path.join(dir_path, varname + '.txt')
			samples[varname] = np.loadtxt(StringIO.StringIO(str(hdfs.read_file(var_file)))).reshape(shape)
		trace = NDArray(model=model)
		trace.samples = samples
		trace.chain = chain
		traces.append(trace)
	return base.MultiTrace(traces)
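A minimal usage sketch for the loader above, assuming a pymc model object built elsewhere and a trace previously written under the given path by the matching HDFS backend; the model name, host, user and path are placeholders:

# hypothetical model and trace location; adjust to the actual cluster
trace = load('user/pymc/traces/model1', model=my_model,
             host='namenode.example.com', port='50070', user_name='pymc')
print trace.varnames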
Example #55
0
def from_pandas(ic, df, table=None, path=None, method='in_query',
        file_format='TEXTFILE', field_terminator='\t', line_terminator='\n',
        escape_char='\\',
        hdfs_host=None, webhdfs_port=50070, hdfs_user=None, overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala
    
    path is the dir, not the filename
    """
    # TODO: this is not atomic
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    schema = zip(columns, types)
    create_stmt = _create_table(table_name, schema, path=path,
            file_format=file_format, field_terminator=field_terminator,
            line_terminator=line_terminator, escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row)) for row in df.values])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            raise ValueError("must supply a path for EXTERNAL table for webhdfs")
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=hdfs_host, port=webhdfs_port,
                user_name=hdfs_user)
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                line_terminator=line_terminator, quoting=csv.QUOTE_NONE, escapechar=escape_char, header=False, index=False)
        hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'), raw_data.getvalue(), overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError("method must be 'in_query' or 'webhdfs'; got %s" % method)
    return from_sql_table(ic, table_name.to_sql())
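A short usage sketch for the `webhdfs` path above, assuming `ic` is an existing Impala context for this module; the table name, HDFS path and namenode details are placeholders:

import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})
# hypothetical table, HDFS directory and cluster details
bdf = from_pandas(ic, df,
                  table='analytics.people',
                  path='/user/impala/people',     # a directory, not a filename
                  method='webhdfs',
                  hdfs_host='namenode.example.com',
                  webhdfs_port=50070,
                  hdfs_user='impala',
                  overwrite=True)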
Example #56
0
class WhenTestingAppendOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()

    def test_append_throws_exception_for_no_redirect(self):

        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_throws_exception_for_not_ok(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_returns_true(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.append_file(self.path, self.file_data)
        self.assertTrue(result)
Example #57
0
class WhenTestingGetContentSummaryOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "ContentSummary": {
                "directoryCount": 2,
                "fileCount": 1,
                "length": 24930,
                "quota": -1,
                "spaceConsumed": 24930,
                "spaceQuota": -1
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_get_status_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.get_content_summary(self.path)

    def test_get_status_returns_true(self):

        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.get_content_summary(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])
Example #58
0
class Store (store.Store):
    """
    HDFS backed store.
    """

    def __init__ (self):
        """ Connect to store """
        self._client = PyWebHdfsClient(host=store_host, port=store_port, user_name=store_user)

    def mkdir (self, path):
        self._client.make_dir(path)

    def read (self, path, open_handle):
        return StoreFile(self._client, path, "r", open_handle)

    def append (self, path, open_handle):
        return StoreFile(self._client, path, "a", open_handle)

    def write (self, path, open_handle):
        return StoreFile(self._client, path, "w", open_handle)

    def exists (self, path):
        try:
            self._client.list_dir(path)
            return True
        except errors.FileNotFound:
            return False
    
    def walk (self, path, visitor, recursive = False):
        """ Walk files in a path. Use recursive=True to include subdirs """
        dirinfo = self._client.list_dir(path)
        for status in dirinfo["FileStatuses"]["FileStatus"]:
            if recursive and status["type"] == "DIRECTORY":
                if len(path) > 0:
                    self.walk(path + "/" + status["pathSuffix"], visitor, recursive)
                else:
                    self.walk(status["pathSuffix"], visitor, recursive)
            else:
                info = dict(name=status["pathSuffix"], 
                            modify=datetime.fromtimestamp(status["modificationTime"]), 
                            size=status["length"])
                visitor(path, info)
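A brief sketch of how the store and its walk visitor might be used, assuming store_host, store_port and store_user are defined at module level as the constructor implies; the paths and the printing visitor are illustrative only:

def print_entry(path, info):
    # the visitor receives the parent path plus a dict with name, modify and size
    print '%s/%s  %s  %d bytes' % (path, info['name'], info['modify'], info['size'])

store = Store()
store.mkdir('user/demo/incoming')
store.walk('user/demo', print_entry, recursive=True)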
Example #59
0
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()