import logging
from os.path import basename

from hdfs import InsecureClient
from hdfs.util import HdfsError


class SavedModelUploader(object):
    """Upload a saved model to the Hadoop file system."""

    def __init__(self, url, user, base_path=""):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._user = user
        self._base_path = base_path
        self._client = InsecureClient(url, user)
        if not self._exist(base_path):
            self._mkdir(base_path)

    def _exist(self, path):
        return self._client.content(path, strict=False) is not None

    def _mkdir(self, path):
        self._client.makedirs(path)

    def _del(self, path):
        self._client.delete(path, recursive=True)

    def _upload(self, local_path, hdfs_path):
        self._client.upload(hdfs_path, local_path)

    def _logging_progress(self, local_path, nbytes):
        # nbytes > 0 reports progress; the hdfs library signals completion
        # for a file by calling the callback with a negative byte count
        if nbytes > 0:
            msg = "uploading: '{}' [{} bytes]".format(local_path, nbytes)
        else:
            msg = "uploading: '{}' [done]".format(local_path)
        self._logger.info(msg)

    def upload(self, local_model_path, overwrite=False):
        hdfs_model_path = self._base_path + '/' + basename(local_model_path)
        existed = self._exist(hdfs_model_path)
        if overwrite and existed:
            self._del(hdfs_model_path)
        elif not overwrite and existed:
            raise RuntimeError(
                "could not overwrite the model: it already exists")
        try:
            self._client.upload(self._base_path, local_model_path,
                                progress=self._logging_progress)
        except HdfsError as e:
            self._logger.error(e)
            return
        self._logger.info("model upload done")
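
# A minimal usage sketch for the uploader above; the WebHDFS URL, user, and
# paths are hypothetical placeholders.
uploader = SavedModelUploader("http://namenode:9870", "hdfs",
                              base_path="/models")
uploader.upload("/tmp/saved_model_v1", overwrite=True)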
import os
import shutil
from os import listdir
from os.path import isfile, join

from fsplit.filesplit import FileSplit
from hdfs import InsecureClient

import settings


def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # locate files in the input directory
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]

    # set up a clean local temp dir for the chunks
    tmp_dir = "{}/tmp".format(input_dir)
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # split files into chunks of `chunk_size` MB, recording each chunk path
    split_files = []

    def split_callback(path, size):
        split_files.append(path)

    for f in files:
        fs = FileSplit(file=f, splitsize=chunk_size * 1e6, output_dir=tmp_dir)
        fs.split(callback=split_callback)

    hdfs_client = InsecureClient(
        "http://{}:9870".format(settings.HDFS_HOST_VALUE),
        user=settings.HDFS_USER_VALUE)

    # delete the existing output dir, if any
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # upload the chunks under /tmp, then rename the uploaded directory
    # (not /tmp itself) to output_dir
    remote_path = hdfs_client.upload(hdfs_path="/tmp",
                                     local_path=tmp_dir,
                                     n_threads=-1,
                                     overwrite=True)
    hdfs_client.rename(remote_path, output_dir)

    print(
        "{} files uploaded to hdfs host '{}{}' ({} file chunks total)".format(
            len(files),
            settings.HDFS_HOST_VALUE,
            output_dir,
            len(split_files),
        ))

    # delete the local temp files
    shutil.rmtree(tmp_dir)
    return output_dir
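
# A hedged example call: split everything under ./raw into 128 MB chunks and
# publish them at /data/raw_chunks (both paths are hypothetical).
upload_to_hdfs("./raw", "/data/raw_chunks", 128)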
def fileExists(server, filename):
    """
    Checks if a file exists in HDFS.
    Args:
        server: HDFS server and port, for example: "http://hadoop1:50070".
        filename: path of the file to check.
    Returns:
        True if the file exists, False otherwise.
    """
    from hdfs import InsecureClient
    client = InsecureClient(server)
    return client.content(filename, strict=False) is not None
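
# Hedged usage sketch; the server URL and path below are placeholders.
if fileExists("http://hadoop1:50070", "/data/input.csv"):
    print("file is present")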
import os
from collections import defaultdict

from hdfs import InsecureClient


def init_context(context):
    key_count = defaultdict(int)
    setattr(context.user_data, "key_count", key_count)

    # init HDFS
    hdfs_host = os.environ.get("HDFS_HOST")
    hdfs_user = os.environ.get("HDFS_USER")
    output_path = os.environ.get("REDUCER_OUTPUT_FILENAME")
    hdfs_client = InsecureClient("http://{}:9870".format(hdfs_host),
                                 user=hdfs_user)

    # delete the existing output file, if any
    if hdfs_client.content(output_path, strict=False) is not None:
        hdfs_client.delete(output_path)

    setattr(context.user_data, "hdfs_client", hdfs_client)
import logging

from colorama import Fore
from hdfs import InsecureClient


def getPathLength(spath, host=None):
    from taskOpt.opt_utils import hadoop_host, hadoop_web_ui_port, hadoop_hdfs_port
    try:
        if host is None:
            host = "http://{hadoop_host}:{hadoop_web_ui_port}".format(
                hadoop_host=hadoop_host,
                hadoop_web_ui_port=hadoop_web_ui_port)
        client = InsecureClient(host)
        # strip the "hdfs://host:port" scheme prefix, keeping only the path
        spath = spath.split("hdfs://{hadoop_host}:{hadoop_hdfs_port}".format(
            hadoop_host=hadoop_host, hadoop_hdfs_port=hadoop_hdfs_port))[-1]
        length = client.content(spath)['length']
        assert length is not None
        return length
    except Exception as e:
        print(Fore.RED +
              "error occurred when accessing HDFS, check the host: {host}".
              format(host=host))
        logging.exception(e)
        raise
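
# Hedged usage sketch; the fully qualified path below is a placeholder, and
# the host/port values come from taskOpt.opt_utils as in the function itself.
size_bytes = getPathLength("hdfs://namenode:9000/data/part-00000")
print("size: {} bytes".format(size_bytes))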
# import the hdfs library
from hdfs import InsecureClient

# connect to the hdfs server
client = InsecureClient('http://master32:50070', user='******')

# print the contents of the hdfs root folder
print(client.list('/'))

path = '/test/aaa.txt'

# check if the file exists, and delete it if so
if client.content(path, strict=False) is not None:
    client.delete(path)

print("START TO WRITE FILE")
# write a text file to hdfs
with client.write(path, encoding='utf-8') as writer:
    for i in range(10):
        writer.write("Hello World\n")
print("DONE")

print("START TO READ FILE")
# read a text file from hdfs
with client.read(path, chunk_size=8096) as reader:
    for chunk in reader:
        print(chunk)
import pandas as pd
from hdfs import InsecureClient

# assumed client setup (not in the original snippet); adjust the WebHDFS URL
# and user for your cluster
client_hdfs = InsecureClient('http://localhost:9870', user='hdfs')

liste_hello = ['hello1', 'hello2']
liste_world = ['world1', 'world2']
df = pd.DataFrame(data={'hello': liste_hello, 'world': liste_world})

# ==== Writing Dataframe to HDFS =====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv',
                       encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv',
                      encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting Content Summary ====
client_hdfs.content('hdfs_path')

# ==== Remove a Directory or File in HDFS ====
client_hdfs.delete('hdfs_path', recursive=False, skip_trash=True)

# ==== Create a Directory ====
client_hdfs.makedirs('hdfs_path', permission=None)

# ==== Upload File into HDFS ====
client_hdfs.upload('hdfs_path', 'local_path', n_threads=1, temp_dir=None,
                   chunk_size=65536, progress=None, cleanup=True)
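
# ==== Download a File from HDFS ====
# (complementary sketch, not in the original cheat sheet; this is the
# standard hdfs-library download call)
client_hdfs.download('hdfs_path', 'local_path', overwrite=True, n_threads=1)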
import json
import urllib.parse
import urllib.request

import redis
from hdfs import InsecureClient

# assumed constant (not defined in the original snippet): the CKAN endpoint
URL_DATI_GOV = "http://www.dati.gov.it"


class MasterCrawler:
    def __init__(self, url_ckan, redis_ip, redis_port):
        self.ckan = url_ckan
        self.r = redis.StrictRedis(host=redis_ip, port=redis_port, db=0)
        self.client = InsecureClient('http://cdh1:50070/', 'admin',
                                     root='/user/admin/open_data')

    def formatUrl(self, url):
        # percent-encode the last path segment of the resource URL
        urlSplit = url.rsplit('/', 1)
        urlEnd = urllib.parse.quote(urlSplit[1])
        urlStart = urlSplit[0]
        return urlStart + "/" + urlEnd

    def initializeRedis(self):
        content = self.client.content('dati_gov/dati_gov.json', strict=False)
        if not content:
            with self.client.write('dati_gov/dati_gov.json',
                                   encoding='utf-8') as writer:
                writer.write('')
        request = urllib.request.Request(URL_DATI_GOV +
                                         "/api/3/action/package_list")
        response = urllib.request.urlopen(request)
        assert response.code == 200
        response_dict = json.loads(response.read())
        # Check the contents of the response.
        assert response_dict['success'] is True
        test_res = response_dict['result']  # optionally limit, e.g. [:2000]
        for res in test_res:
            print(res)
            self.r.rpush("dataset_id", res)

    def consumeData(self):
        red = self.r
        while red.llen("dataset_id") != 0:
            dataset_id = red.lpop("dataset_id").decode('utf-8')
            encRes = urllib.parse.urlencode({"id": dataset_id})
            request_info = urllib.request.Request(
                URL_DATI_GOV + "/api/3/action/package_show?" + encRes)
            # request_info.add_header("Authorization", "Basic %s" % base64string)
            try:
                response_info = urllib.request.urlopen(request_info)
                info_dataset = json.loads(response_info.read())
                info = info_dataset['result']
                if 'resources' in info:
                    info["m_status_resources"] = "ok"
                    resources = info['resources']
                    for resource in resources:
                        rUrl = resource['url']
                        rFormat = resource['format']
                        rId = resource['id']
                        finalUrl = self.formatUrl(rUrl)
                        print(finalUrl)
                        rInfo = urllib.request.Request(finalUrl)
                        try:
                            rReq = urllib.request.urlopen(rInfo)
                            if rReq.code == 200:
                                resource["m_status"] = "ok"
                                if "csv" in rFormat.lower():
                                    print("passing through here")
                                    data = rReq.read()
                                    data_dir = ("dati_gov/open_api/csv/" +
                                                dataset_id)
                                    if not self.client.content(data_dir,
                                                               strict=False):
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".csv"
                                    with self.client.write(file_path) as writer:
                                        writer.write(data)
                                if "json" in rFormat.lower():
                                    data = rReq.read()
                                    data_dir = ("dati_gov/open_api/json/" +
                                                dataset_id)
                                    if not self.client.content(data_dir,
                                                               strict=False):
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".json"
                                    with self.client.write(file_path) as writer:
                                        writer.write(data)
                            else:
                                resource["m_status"] = "ko"
                        except Exception as e:
                            resource["m_status"] = "ko"
                            print(str(e))
                else:
                    print(info)
                    info["m_status_resources"] = "ko"
                    print("NO RESOURCES")
                with self.client.write('dati_gov/dati_gov.json',
                                       encoding='utf-8',
                                       append=True) as writer:
                    writer.write(json.dumps(info) + '\n')
            except Exception as e:
                print(str(e))
                red.lpush("dataset_error", dataset_id)
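
# Hedged usage sketch; the Redis host/port are placeholders, and URL_DATI_GOV
# is the CKAN endpoint assumed above.
crawler = MasterCrawler(URL_DATI_GOV, "localhost", 6379)
crawler.initializeRedis()
crawler.consumeData()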
# helper to print ls-style stats about a file/directory;
# the prefix is the first printed permission character,
# 'd' for a directory, otherwise '-'
# (`perms` is assumed to be defined elsewhere in the activity script)
def printfile(name, stats, prefix='-'):
    print(' '.join((
        perms(stats['permission'], prefix),
        ' -' if stats['replication'] == 0 else '%3d' % stats['replication'],
        stats['owner'],
        stats['group'],
        '%10d' % stats['length'],
        datetime.fromtimestamp(
            stats['modificationTime'] / 1000).strftime('%Y-%m-%d %H:%M'),
        name)))


# 1. Make a directory named: /activity1/
content = client.content('/activity1')
print(content)
# client.makedirs(hdfs_path='/activity1/', permission=None)
# client.makedirs(hdfs_path='/activity1/data/', permission=None)
# print(client.list(hdfs_path='/'))
# print('/activity1/data directory created')

# 2. Put the file RandomText.txt into HDFS as the path: /activity1/data/RandomText.txt
# client.upload(hdfs_path='/activity1/data/', local_path='/Workspace/cs6500_sp2021_r02_jeon/RandomText.txt')
# print('Uploaded')

# 3. List the contents of the directory /activity1/data/ (see the sketch below)
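
# A hedged sketch for step 3, driving `printfile` above: list the directory
# and print the status of each entry (`client` is the hdfs InsecureClient
# used throughout these activities).
for name in client.list('/activity1/data/'):
    stats = client.status('/activity1/data/' + name)
    printfile(name, stats, 'd' if stats['type'] == 'DIRECTORY' else '-')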
from hdfs import InsecureClient


def getPathLength(spath, host='http://localhost:50070', user='******'):
    try:
        client = InsecureClient(host, user)
        return client.content(spath)['length']
    except Exception:
        print("error occurred when accessing HDFS, check the host:", host,
              "and the user:", user)
# coding: utf-8
"""
Copy the parsed log into the sandbox
"""
import hdfs
from hdfs import InsecureClient

# open a session against WebHDFS
client = InsecureClient('http://sandbox.hortonworks.com:50070',
                        user='******',
                        timeout=1000)

# these values could be passed as program parameters
sourceFile = "datos/salida-log_2.txt"
destFile = "log_parser_apache"
sandboxPath = "/datos/logApache/"
dpath = sandboxPath + destFile
limite = 1000

content = client.content(sandboxPath)
print(content)

# open the source file and write its contents to HDFS with the parameters
# we have set
with open(sourceFile, 'r') as fo:
    client.write(dpath, data=fo.read())
# https://hdfscli.readthedocs.io/en/latest/quickstart.html
from hdfs import InsecureClient

client = InsecureClient(url='http://localhost:9870/', user='******')
# from hdfs import Config
# client = Config().get_client('dev')

# Retrieving a file or folder content summary.
content = client.content('/test_datas')

# Listing all files inside a directory.
fnames = client.list('/test_datas')

# Retrieving a file or folder status.
# status = client.status('/test_datas/sample1.txt')

# Renaming ("moving") a file.
# client.rename('/test_datas/sample1.txt', '/test_datas/sample5.txt')

# Deleting a file or folder.
# client.delete('/test_temps', recursive=True)

hdfspath = '/test_datas/'
localpath = '/Users/janevallette/Documents/Develops/learn_bigdata/datas/cat.jpeg'
result = client.upload(hdfspath, localpath)

# Writing part of a file.
# with open('datas/upfile.txt') as reader, client.write('/test_datas/upfile1.txt') as writer:
#     for line in reader:
#         # if line.startswith('-'):
#         writer.write(line)
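
# Downloading a file or folder (a complementary sketch in the same style;
# the local target path is a placeholder).
# client.download('/test_datas/cat.jpeg', '/tmp/cat.jpeg', overwrite=True)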
from hdfs import InsecureClient


class HDFSLibrary:
    """ Test library for working with HDFS """
    WEB_HDFS_URL = ""
    client = None

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            msg = "ERROR: File does not exist: " + file_path
            if stop:
                print(msg)
                raise IOError(msg)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self, file_path, text1, text2="aqwszx",
                                   text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        found = ""
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(text2) == -1 \
                        and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self, file_path, text1, text2="aqwszx",
                                     text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(text2) != -1 \
                        or line.find(text3) != -1:
                    return False
        return True

    ########################
    #                      #
    #   BASIC FUNCTIONS:   #
    #                      #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """ Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """ Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """ Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """ Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """ Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful, otherwise `False`
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """ Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path, local_path, overwrite=True,
                                    n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """ Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path, local_path, overwrite=True,
                                  n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """ Get the checksum value for a file.
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """ Create a directory or recursive dirs on HDFS.
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
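
# A hedged usage sketch for the test library above; the namenode address and
# the file path are placeholders.
lib = HDFSLibrary(namenode="localhost", port="50070")
if lib.check_hdfs_file_exists("/data/sample.txt"):
    print(lib.get_hdfs_file_content("/data/sample.txt"))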
import posixpath as psp

# 4. Move the HDFS file /activity1/data/RandomText.txt to /activity1/data/NotSoRandomText.txt
client.rename('/activity1/data/RandomText.txt',
              '/activity1/data/NotSoRandomText.txt')

# 5. Append the local file RandomText.txt to the end of the HDFS file:
#    /activity1/data/NotSoRandomText.txt
with open('./RandomText.txt', 'r') as f:
    data = f.read()
client.write(hdfs_path='/activity1/data/NotSoRandomText.txt',
             data=data,
             append=True)

# 6. List the disk space used by the directory /activity1/data/
diskSpaceUsed = client.content('/activity1/data/', strict=True)
print(diskSpaceUsed['spaceConsumed'])

# 7. Put the local file MoreRandomText.txt into HDFS as the path: /activity1/data/MoreRandomText.txt
client.upload(hdfs_path='/activity1/data/', local_path='./MoreRandomText.txt')
print(client.list('/activity1/data'))

# 8. Recursively list the contents of the directory /activity1/
fpaths = [
    psp.join(dpath, fname)
    for dpath, _, fnames in client.walk('/activity1') for fname in fnames
]
print(fpaths)
    stop = time()
    print("Stop: " + str(stop))
    print("Total elapsed time: " + str(stop - start) + " seconds")
elif model == 'upload_delete':
    start = time()
    print("Start: " + str(start))
    if False:  # disabled step: regenerate the tif files before uploading
        do_foreach_file('32652(copy)/5104', createtif)
    upload_tif(client_hdfs)
    stop = time()
    print("Stop: " + str(stop))
    print("Total elapsed time: " + str(stop - start) + " seconds")
elif model == 'upload_download':
    # do_foreach_file('32652(copy)/5104', createtif)
    # client_hdfs.delete('/gf1', recursive=True)
    if client_hdfs.content('/gf1', strict=False) is None:
        client_hdfs.upload('/gf1', '32652(copy)')
    start = time()
    print("Start: " + str(start))
    download_tif(client_hdfs)
    stop = time()
    print("Stop: " + str(stop))
    print("Total elapsed time: " + str(stop - start) + " seconds")
elif model == 'rows_download':
    if client_hdfs.content('/gf1', strict=False) is None:
        client_hdfs.upload('/gf1', '32652(copy)')
    start = time()
    print("Start: " + str(start))
    rows_download_tif(client_hdfs)
    stop = time()
    print("Stop: " + str(stop))
from hdfs import InsecureClient

client = InsecureClient('http://manager.novalocal:50070', user='******')

# Let's look at what is in the working directory
print(client.list('/student9_7'))
'''
['cur_readme', 'googlobots.txt', 'py_dir_02', 'readme', 'test', 'test2', 'testdir']
'''

# Let's check the size of our working directory
print(client.content('/student9_7'))
'''
{'directoryCount': 3, 'fileCount': 5, 'length': 10552, 'quota': -1, 'spaceConsumed': 31637, 'spaceQuota': -1}
'''

# Read the `test` file
with client.read('/student9_7/test') as reader:
    test = reader.read()
print(test)
'''
b'test file for hdfs\n'
'''

# Copy the `test` file from storage to the local home directory under the
# name `downloaded_file_via_py3`
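
# Completing the step announced above (the exact local target path is an
# assumption based on the comment):
client.download('/student9_7/test', './downloaded_file_via_py3',
                overwrite=True)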