class WhenTestingListDirOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatuses": {
                "FileStatus": [
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 24930,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "a.patch",
                        "permission": "777",
                        "replication": 0,
                        "type": "FILE"
                    },
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 0,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "",
                        "permission": "777",
                        "replication": 0,
                        "type": "DIRECTORY"
                    }
                ]
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_get_status_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.list_dir(self.path)

    @patch.object(Session, 'get')
    def test_get_status_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.list_dir(self.path)
        for key in result:
            self.assertEqual(result[key], self.file_status[key])
class HDFS(NDArray):
    '''
    HDFS storage

    Parameters
    ----------
    name : str
        Name of the directory used to store text files (path to the directory)
        without a leading '/'
    model : Model
        If None, the model is taken from the 'with' context
    vars : list of variables
        Sampling values will be stored for these variables. If None,
        'model.unobserved_RVs' is used
    host : str
        The IP address or hostname of the HDFS namenode. By default,
        it is 'localhost'
    port : str
        The port number for WebHDFS on the namenode. By default,
        it is '50070'
    user_name : str
        WebHDFS user_name used for authentication. By default, it is None
    '''

    def __init__(self, name, model=None, vars=None, host='localhost',
                 port='50070', user_name=None):
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        try:
            self.hdfs.list_dir(name)
        except FileNotFound:
            self.hdfs.make_dir(name)
        super(HDFS, self).__init__(name, model, vars)

    def close(self):
        super(HDFS, self).close()
        _dump_trace(self.name, self)
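# A minimal usage sketch for the HDFS trace backend above. It assumes an older
# PyMC3 release whose pm.sample() still accepts a backend instance via trace=;
# the model, draw count, directory name, and namenode address are illustrative
# placeholders, not taken from the original snippet.
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0.0, sd=1.0)
    # directory name without a leading '/', as the docstring above requires
    backend = HDFS('pymc3_traces/run1', host='localhost', port='50070', user_name='hdfs')
    trace = pm.sample(500, trace=backend)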
class HdfsHandler:

    def __init__(self, hadoopHost, hadoopPort='50070', user='******'):
        # self.hdfs = PyWebHdfsClient(host='52.14.121.163', port='50070', user_name='hadoop')
        self.hdfs = PyWebHdfsClient(host=hadoopHost, port=hadoopPort, user_name=user)
        self.s3_client = boto3.client('s3')

    def copyToHDFS(self, src_path, hdfs_path):
        if hdfs_path.startswith("hdfs"):
            temp_path = hdfs_path.split("8020")
            self.new_hdfs_path = temp_path[1] + '/lib'
            print("New Path: %s" % self.new_hdfs_path)

        # create a new client instance
        # print("New Path: %s" % self.new_hdfs_path[1])
        jar_name = os.path.basename(src_path)
        print(src_path)
        fileContent = open(src_path, 'rb').read()

        # copies file to local for testing purposes
        # with open("E:/temp/java-0.0.2.jar", "wb") as jarfile:
        #     jarfile.write(fileContent)

        # create a new file on hdfs
        print('making new file at: {0}\n'.format(jar_name))
        result = self.hdfs.create_file(self.new_hdfs_path + "/" + jar_name,
                                       fileContent, overwrite=True)
        print("HDFS Copy Result: %s" % result)
        return result

    def list_hdfs_dir(self, hdfs_path):
        print(self.hdfs.list_dir(hdfs_path))
def downParts(fpath):
    '''Download Spark output files (the part-0??? series) from HDFS,
    concatenating them into a single local file while downloading.'''
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='iasp76', port='12003', user_name='mci')
    flist = hdfs.list_dir(fpath)
    x = flist['FileStatuses']['FileStatus']
    _SUCCESS = False
    for f in x:
        if f['pathSuffix'] == '_SUCCESS':
            _SUCCESS = True
            break
    if not _SUCCESS:
        print("not complete yet!")
        return
    fnames = [f['pathSuffix'] for f in x if f['pathSuffix'].startswith('part-')]
    fnames1 = sorted(fnames)
    foutname = fpath[fpath.rfind('/') + 1:]
    l = len(fnames1)
    with open(foutname, "wb") as fo:
        for fname in fnames1:
            fpath1 = fpath + "/" + fname
            fo.write(hdfs.read_file(fpath1))
            print(" progress: ", fname, l)
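# A minimal, hypothetical call of downParts above; the output directory is a
# made-up example and the namenode address is hard-coded inside the function.
# If the _SUCCESS marker is present, the part files are concatenated into a
# local file named after the last path component (here 'daily_agg').
downParts('/user/mci/output/daily_agg')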
class Store(store.Store):
    """ HDFS backed store. """

    def __init__(self):
        """ Connect to store """
        self._client = PyWebHdfsClient(host=store_host, port=store_port,
                                       user_name=store_user)

    def mkdir(self, path):
        self._client.make_dir(path)

    def read(self, path, open_handle):
        return StoreFile(self._client, path, "r", open_handle)

    def append(self, path, open_handle):
        return StoreFile(self._client, path, "a", open_handle)

    def write(self, path, open_handle):
        return StoreFile(self._client, path, "w", open_handle)

    def exists(self, path):
        try:
            dirinfo = self._client.list_dir(path)
            return True
        except errors.FileNotFound:
            return False

    def walk(self, path, visitor, recursive=False):
        """ Walk files in a path. Use recursive=True to include subdirs """
        dirinfo = self._client.list_dir(path)
        for status in dirinfo["FileStatuses"]["FileStatus"]:
            if recursive and status["type"] == "DIRECTORY":
                if len(path) > 0:
                    self.walk(path + "/" + status["pathSuffix"], visitor, recursive)
                else:
                    self.walk(status["pathSuffix"], visitor, recursive)
            else:
                info = dict(name=status["pathSuffix"],
                            modify=datetime.fromtimestamp(status["modificationTime"]),
                            size=status["length"])
                visitor(path, info)
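# A small sketch of a visitor callback for Store.walk above. It assumes the
# store configuration globals (store_host, store_port, store_user) used by the
# constructor are already defined; the HDFS path is a made-up example.
def print_entry(path, info):
    # info carries the 'name', 'modify' and 'size' keys built in walk()
    print('{0}/{1}\t{2}\t{3}'.format(path, info['name'], info['size'], info['modify']))

store = Store()
store.walk('user/hdfs/data', print_entry, recursive=True)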
class HdfsHandler(object):

    def __init__(self):
        self._HDFS = PyWebHdfsClient(host='10.81.1.160', port='50070', user_name='hdfs')

    def readFile(self, file):
        dirToRead = "%s/%s" % (LOG_ROOT_DIR, file)
        dataOut = self._HDFS.list_dir(dirToRead)
        fileToRead = "%s/%s" % (
            dirToRead, dataOut['FileStatuses']['FileStatus'][1]['pathSuffix'])
        return self._HDFS.read_file(fileToRead)
# checksum reflects file changes
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

example_file = '{dir}/example.txt'.format(dir=rename_dir)

# delete the example file
print('deleting example file at: {0}'.format(example_file))
hdfs.delete_file_dir(example_file)

# list the contents of the directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

# delete the example directory
print('deleting the example directory at: {0}'.format(rename_dir))
hdfs.delete_file_dir(rename_dir, recursive='true')
from pywebhdfs.webhdfs import PyWebHdfsClient
import logging
from pprint import pprint

logging.basicConfig(level=logging.DEBUG)
_LOG = logging.getLogger(__name__)

# host = your namenode address, user_name = your HDFS user
hdfs = PyWebHdfsClient(host='', port='50070', user_name='hduser', timeout=4)

my_dir = '/user/hduser/sample'
fileFinal = my_dir + '/file.txt'

pprint(hdfs.list_dir(my_dir))

dir_status = hdfs.get_file_dir_status(my_dir)
print(dir_status)

print("Reading file from hadoop hdfs")
file_data = hdfs.read_file("user/hduser/sample/file.txt")
print(file_data)
if __name__ == '__main__':
    global data_creator  # one object to hold all the data to stream to the client
    global date_list     # one list object to hold all the dates whose data are stored in HDFS

    # start a new thread to run the server separately
    server = Thread(target=runServer)
    server.start()

    # create the wall coordinates list for use later in this view
    DataCreator.read_transformed_coordinates_to_array()
    # instance that creates data in different formats for each date
    data_creator = DataCreator()
    date_list = []

    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070', user_name='uacharya')

    dir_listing = hdfs.list_dir('user/uacharya/flow')
    # list of dir dictionaries
    ls_dir = dir_listing['FileStatuses']['FileStatus']
    # append every date whose data is stored in hdfs to the list
    for d in ls_dir:
        date_list.append(int(d['pathSuffix']))

    # create the required data in memory for all the dates available
    for date in date_list:
        print("started creating data for date {}".format(date))
        data_creator.create_data_for_date(date)
        print(data_creator.check_available_data(date, aggregated=True))
        print(data_creator.check_available_data(date, bitmap=True))
from os.path import join
# imports implied by the names used below; Share is assumed to come from the
# yahoo_finance package used by the original script
import datetime as dt
from pyspark import SparkContext
from pywebhdfs.webhdfs import PyWebHdfsClient
from yahoo_finance import Share


def formatLine(line):
    line = line.values()
    line[0], line[5] = line[5], line[0]
    return ', '.join(line)


if __name__ == '__main__':
    host = 'hdfs://localhost:9000'
    ticker_path = host + '/user/hadoop/tickers.txt'
    save_path = host + '/user/hadoop/stock'
    hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hadoop')

    folder = hdfs.list_dir('user/hadoop/stock')['FileStatuses']['FileStatus']
    files = sorted([dt.datetime.strptime(f['pathSuffix'].split('.')[0], '%Y-%m-%d').date()
                    for f in folder])
    end = dt.date.today().strftime('%Y-%m-%d')

    sc = SparkContext(appName='stock_data')

    if len(files) > 3:
        hdfs.delete_file_dir(join(save_path, files[0].strftime('%Y-%m-%d') + '.csv'),
                             recursive=True)

    if len(files) == 0:
        start = '2014-01-01'
        stockData = (sc.textFile(ticker_path)
                     .flatMap(lambda x: Share(x).get_historical(start, end))
                     .map(formatLine))
        stockData.saveAsTextFile(join(save_path, end + '.csv'))
    else:
        start = (files[-1] + dt.timedelta(days=1)).strftime('%Y-%m-%d')
        histStockData = sc.textFile(join(save_path, files[-1].strftime('%Y-%m-%d') + '.csv'))
def prepare_for_CPP_converter(task_name, in_dir, ofile_path, hdfs_host, hdfs_port, hdfs_user):
    hdfs_client = PyWebHdfsClient(host=hdfs_host, port=str(hdfs_port), user_name=hdfs_user)
    file_statuses = hdfs_client.list_dir(in_dir)["FileStatuses"]["FileStatus"]
    # pprint(file_statuses)
    file_list = sorted(
        [file_status["pathSuffix"] for file_status in file_statuses
         if "COPY" not in file_status["pathSuffix"]],
        key=lambda x: int(x.split(".")[-1]))

    ofile = open(ofile_path, "wb")
    req_buffer = defaultdict(list)
    finished_log_idx, n_req_total, last_write_ts, obj_id_mapping = -1, 0, 0, {}

    for ifile_name in file_list:
        n_req = 0
        ifile_path = "{}/{}".format(in_dir, ifile_name)
        file_idx = int(ifile_name.split(".")[-1])
        if file_idx < finished_log_idx:
            logging.info("skip {}".format(ifile_path))

        trace_reader = TraceReaderHDFS(ifile_path, "twr", hdfs_host, hdfs_port, hdfs_user,
                                       hdfs_buffer_size=128 * MB)
        raw_line = trace_reader.read_one_req()
        while raw_line:
            line = raw_line.decode()
            n_req += 1
            ts, obj_id, ksize, vsize, client_id, op, namespace, ttl = _parse_log_line(
                line, last_write_ts, line_no=n_req,
                ifile_path="{}/{}".format(in_dir, ifile_name))
            if last_write_ts == 0:
                last_write_ts = ts - 1

            # gizmoduck start time
            if task_name == "gizmoduck_cache" or task_name == "wtf_req_cache":
                if ts < 1585706400:
                    raw_line = trace_reader.read_one_req()
                    continue
                # # gizmoduck end time
                # if ts > 1585706400 + 3600 * 24 * 2:
                #     break

            # friday 4pm
            if ts < 1585324800:
                raw_line = trace_reader.read_one_req()
                continue
            # next friday 4pm GMT
            if ts > 1585929600:
                break

            req = Req(real_time=ts, obj_id=obj_id, key_size=ksize, value_size=vsize,
                      op=op, ttl=ttl)
            last_write_ts = buffer_req_to_guarantee_time_monotonic(
                req, req_buffer, ofile, last_write_ts, 2)
            raw_line = trace_reader.read_one_req()

        n_req_total += n_req
        t1 = time.time()
        logging.info(
            "finish converting one file - {} kReq / total {} kReq, {} kobj, dump time {:.2f}s ifile {}"
            .format(n_req // 1000, n_req_total // 1000, len(obj_id_mapping) // 1000,
                    time.time() - t1, "/".join(ifile_path.split("/")[-4:])))

    for ts in sorted(req_buffer.keys()):
        for req in req_buffer[ts]:
            _write_to_ofile(req, ofile)

    logging.info(
        "****************************** finish all conversion {} kReq, ofile {}"
        .format(n_req_total // 1000, ofile_path))
    with open("conversion.finish", "a") as ofile2:
        ofile2.write("{}\n".format(in_dir))

    ofile.flush()
    ofile.close()
    return n_req_total, len(obj_id_mapping)
class HadoopFileSystem(BaseFs.FileSystem):

    def __init__(self, vcPath, simulateOnly=False, isVerbose=False, logger=None,
                 user=None, host=None, port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost, port=hdfsPort, user_name=hdfsUser)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".format(path, e))
        return fd

    def exists_file_dir(self, fd):
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(fd.abspath, recursive))
        else:
            try:
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?",
                                     default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath, recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))

    def list_dir(self, fd):
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self, fd.abspath, isSrc=True,
                                       needsDstDirCheck=False, fileJson=item)

    def make_dir(self, path):
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}".format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}".format(path, e))

    def open_file(self, fd, rwMode):
        return fd

    def close_file(self, fd):
        pass

    def touch_file(self, fd):
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                self.hdfs.create_file(fd.abspath, 0, overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}".format(fd.abspath, e))

    def truncate_file(self, fd, size):
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # The workaround for the unordered concat bug in Hadoop 2.7.1 is to use one source at a time
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        chunkedList = [chunkFdList[pos:pos + concatStep]
                       for pos in range(0, len(chunkFdList), concatStep)]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException as e:
                break
        return currIndex

    def concat_files(self, fd, chunkFdList):
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)

        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath, offset=offset, length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}".format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}".format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in the Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(src.abspath, dst.abspath))
        else:
            print("Copy within HDFS is not supported due to lack of Hadoop support")
            print("Once symbolic links are enabled, this feature will be enabled")
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True)

    def get_hdfs_file_dir_json(self, path):
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
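# A hedged sketch of how the HadoopFileSystem wrapper above might be exercised.
# The constructor arguments, the concrete paths, and the .abspath attribute on
# the yielded descriptors are assumptions for illustration; the Errors classes
# are the ones raised in the snippet itself.
fs = HadoopFileSystem('/user/project', user='hdfs', host='namenode', port='50070')
try:
    root = fs.make_fd('/user/project/data', isSrc=True, dstDirMustExist=False)
    # list_dir is a generator of HadoopFileDescriptor objects
    for entry in fs.list_dir(root):
        print(entry.abspath)
except Errors.FileNotFound as e:
    print('missing: {0}'.format(e))
except Errors.BadConnection as e:
    print('namenode unreachable: {0}'.format(e))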
class WhenTestingListDirOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatuses": {
                "FileStatus": [
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 24930,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "a.patch",
                        "permission": "777",
                        "replication": 0,
                        "type": "FILE"
                    },
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 0,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "",
                        "permission": "777",
                        "replication": 0,
                        "type": "DIRECTORY"
                    }
                ]
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_get_status_throws_exception_for_not_ok(self):
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.list_dir(self.path)

    def test_get_status_returns_true(self):
        self.response.status_code = httplib.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.list_dir(self.path)
            for key in result:
                self.assertEqual(result[key], self.file_status[key])
def upload_to_hdfs(self, local_file, table, index):
    '''
    upload a file from the local filesystem to hdfs
    '''
    hiveOper = hive_op.HiveOperation()
    local_dir = self._conf.get('local', 'data_dir')
    local_path = '{}{}/{}'.format(local_dir, index, local_file)

    host1 = self._conf.get('hdfs', 'name_node1')
    host2 = self._conf.get('hdfs', 'name_node2')
    user = self._conf.get('hdfs', 'user')
    port = self._conf.getint('hdfs', 'port')
    hdfs_base_path = self._conf.get('hdfs', 'upload_path')
    hdfs_dir_path = '{}{}'.format(hdfs_base_path, index)
    hdfs_path = '{}{}/{}'.format(hdfs_base_path, index, local_file)

    # implement HA manually
    try:
        hdfs_cli = PyWebHdfsClient(host=host1, port=port, user_name=user)
        hdfs_cli.list_dir('/')
    except Exception as e:
        logger.warn('open hdfs client failed error {}'.format(e))
        hdfs_cli = PyWebHdfsClient(host=host2, port=port, user_name=user)
        hdfs_cli.list_dir('/')

    if hdfs_cli is None:
        logger.error('no active host')
        return None

    try:
        hdfs_cli.get_file_dir_status(hdfs_path)
        # If the temporary file already exists on HDFS, the previous load into
        # Hive probably failed, or the process was killed partway through.
        # Load the data from the temporary file into Hive first, then continue.
        ret = hiveOper.load_hdfs_file_into_tmp_table(hdfs_path, table)
        if ret == -1:
            logger.error('load from hdfs to tmp table failed')
        logger.info('last time! {} load into tmp finished'.format(table))
        hiveOper.load_tmp_table_to_main(table)
        logger.info('last time! {} load tmp table to main finished'.format(table))
    # FileNotFoundException
    except Exception as e:
        # the file not existing is the normal case
        logger.debug('no such file {}'.format(hdfs_path))

    retry_count = 0
    upload_finished = False
    while retry_count <= 10 and not upload_finished:
        with open(local_path) as f:
            logger.debug('''local path is {}, hdfs_cli is {}, file is {},
                         hdfs_path is {}'''.format(local_path, hdfs_cli, f, hdfs_path))
            # hdfs_cli.delete_file_dir(hdfs_path)

            # if the directory does not exist, create it first
            try:
                hdfs_cli.get_file_dir_status(hdfs_dir_path)
            except Exception as e:
                hdfs_cli.make_dir(hdfs_dir_path)

            try:
                hdfs_cli.create_file(hdfs_path, f)
                upload_finished = True
            except Exception as e:
                logger.warn('''create file on hdfs failed, local path is {},
                            hdfs path is {}, retry count {}, upload flag {}'''.format(
                    local_path, hdfs_path, retry_count, upload_finished))
                logger.warn('error is {}'.format(e))
                retry_count += 1

    if retry_count <= 10:
        return hdfs_path
    else:
        logger.error('''{} upload 10 times, still failed, retry count {},
                     upload_flag is {}'''.format(local_path, retry_count, upload_finished))
        return None
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

example_file = '{dir}/example.txt'.format(dir=rename_dir)

# delete the example file
print('deleting example file at: {0}'.format(example_file))
hdfs.delete_file_dir(example_file)

# list the contents of the directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

# delete the example directory
print('deleting the example directory at: {0}'.format(rename_dir))
hdfs.delete_file_dir(rename_dir, recursive='true')
if __name__ == "__main__":
    # set up the spark env
    spark = (SparkSession.builder.appName('Data collection with hive')
             .enableHiveSupport().getOrCreate())
    sc = spark.sparkContext

    hdfs = PyWebHdfsClient(host='r-9arp1kfy-0.localdomain', port='50070',
                           user_name='nikikiq')

    data_dir = str(sys.argv[1])
    dump_limit = 2000

    dir_status = hdfs.list_dir(data_dir)
    files = get_files(dir_status['FileStatuses']['FileStatus'])

    if files is None:
        print('No data available, please try again!')
    else:
        # check logged files and files which may have failed due to various errors
        try:
            with open('/vdc/team40/nia_test/collected/file_log.txt', 'r') as my_log:
                logged_files = [line.strip() for line in my_log]
                my_log.close()
        except FileNotFoundError:
            logged_files = []