from json import dump, load

from hdfs import Config

# Get the client for the `dev` alias (see the configuration sketch below).
client = Config().get_client('dev')

# Some sample data that we want to store on HDFS.
model = {
    'first_feature': 2.,
    'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
with client.write('models/1.csv', encoding='utf-8') as writer:
    for item in model.items():
        writer.write(u'%s,%s\n' % item)

# We can also serialize it to JSON and directly upload it.
with client.write('models/1.json', encoding='utf-8') as writer:
    dump(model, writer)

# We can check that the files exist and get their properties.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Later, we can download the files back. The `delimiter` option makes it
# convenient to read CSV files.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
    items = (line.split(',') for line in reader if line)
    assert dict((name, float(value)) for name, value in items) == model

# Loading JSON directly from HDFS is even simpler.
with client.read('models/1.json', encoding='utf-8') as reader:
    assert load(reader) == model
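The `Config().get_client('dev')` calls in these snippets resolve the `dev` alias from an HDFSCLI configuration file (`~/.hdfscli.cfg` by default, or the path in the `HDFSCLI_CONFIG` environment variable). A minimal sketch of such a file follows; the NameNode URL and user name are placeholder assumptions:

[global]
default.alias = dev

[dev.alias]
url = http://localhost:50070
user = hadoop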
import os
from json import load

from hdfs import Config

# `DataSourceType` is assumed to be importable from the surrounding project;
# it enumerates the supported formats (SPARK_CSV, SPARK_TSV, SPARK_XML,
# SPARK_JSON).


class SparkHDFSClient(object):
    def __init__(self, datasource):
        self.datasource = datasource
        self.client = Config().get_client("dev")

    def get_file_list(self, folder):
        """Lists the entries of `folder`, prefixed with the folder path."""
        files = self.client.list(folder.strip())
        return [folder + '/' + file for file in files]

    def list_collections(self):
        results = []
        status = self.client.status(self.datasource.url, strict=False)
        if status is None:
            return results
        if status['type'] != "DIRECTORY":
            return [{
                "db": self.datasource.url,
                "document": self.datasource.url,
                "count": -1,
            }]
        # Walk the directory iteratively, descending into sub-directories.
        # Entries returned by `get_file_list` already carry their full path,
        # so they can be passed to `status` directly.
        files = self.get_file_list(self.datasource.url)
        while files:
            file = files.pop()
            status = self.client.status(file, strict=False)
            if status is None:
                continue
            if status['type'] == "DIRECTORY":
                files.extend(self.get_file_list(file))
                continue
            # Skip files whose extension does not match the datasource type.
            if self.datasource.dstype in (DataSourceType.SPARK_CSV,
                                          DataSourceType.SPARK_TSV) \
                    and file[-2:] != 'sv':
                continue
            if self.datasource.dstype == DataSourceType.SPARK_XML \
                    and file[-3:] != 'xml':
                continue
            if self.datasource.dstype == DataSourceType.SPARK_JSON \
                    and file[-4:] != 'json':
                continue
            results.append({
                "db": file[:file.rfind('/')] if '/' in file
                      else self.datasource.url,
                "document": file[file.rfind('/') + 1:] if '/' in file
                            else file,
                "count": -1,
            })
        return results

    def get_documents(self, filename, limit=10):
        """Reads up to `limit` records from `filename`, keyed by header."""
        results = []
        header = None
        rows = 0
        if self.datasource.dstype in (DataSourceType.SPARK_CSV,
                                      DataSourceType.SPARK_TSV):
            with self.client.read(filename, encoding='utf-8',
                                  delimiter='\n') as reader:
                for line in reader:
                    if not line.strip() or line[0] == '#':
                        continue
                    line = (line.split(',') if filename[-3:] == 'csv'
                            else line.split('\t'))
                    # The first non-comment line is treated as the header.
                    if header is None:
                        header = line
                        continue
                    results.append({
                        header[i]: line[i]
                        for i in range(len(line)) if i < len(header)
                    })
                    rows += 1
                    if rows > limit + 1:
                        break
        elif self.datasource.dstype == DataSourceType.SPARK_XML:
            # For XML we only preview the first raw chunk of the file.
            with self.client.read(filename, encoding='utf-8',
                                  chunk_size=2048) as reader:
                for chunk in reader:
                    results.append({'content': str(chunk)})
                    break
        elif self.datasource.dstype == DataSourceType.SPARK_JSON:
            with self.client.read(filename, encoding='utf-8') as reader:
                model = load(reader)
            if isinstance(model, list):
                # Summarize nested lists/dicts as printable key listings.
                model = [{
                    p: str(list(md[p][0].keys()))
                       if isinstance(md[p], list) and isinstance(md[p][0], dict)
                       else str(md[p]) if isinstance(md[p], list)
                       else str(list(md[p].keys())) if isinstance(md[p], dict)
                       else md[p]
                    for p in md
                } for md in model]
                results.extend(model)
            else:
                model = {
                    p: str(list(model[p][0].keys()))
                       if isinstance(model[p], list)
                       and isinstance(model[p][0], dict)
                       else model[p] if isinstance(model[p], list)
                       else str(list(model[p].keys()))
                       if isinstance(model[p], dict)
                       else model[p]
                    for p in model
                }
                results.append(model)
        return results[:limit], limit
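A minimal usage sketch for `SparkHDFSClient`. The class only touches the `url` and `dstype` attributes of its datasource, so a simple stand-in object is enough for illustration; the paths below are placeholders:

from types import SimpleNamespace

datasource = SimpleNamespace(url='/data/models',
                             dstype=DataSourceType.SPARK_CSV)
spark_client = SparkHDFSClient(datasource)

# Enumerate the CSV files under the datasource URL...
for collection in spark_client.list_collections():
    print(collection['db'], collection['document'])

# ...and preview the first five records of one of them.
rows, limit = spark_client.get_documents('/data/models/1.csv', limit=5)
for row in rows:
    print(row)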
import os

from hdfs import Config

# `catch_hdfs_error` is assumed to be a project decorator that catches
# `HdfsError` and turns it into a false-y return value.


class HadoopWebExplorer:
    def __init__(self, debug=False):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.hdfscli.cfg')
        self.client = Config(path).get_client()
        self.debug = debug

    def print(self, *args):
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether the given path already exists

        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether the path already exists
        :rtype: bool
        """
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates a folder with the given name if it does not exist

        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: returns true if the folder was created or already exists,
            otherwise false
        :rtype: bool
        """
        if self.path_exists(folder_name):
            self.print(f'Folder already exists: {folder_name}')
            return True
        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')
        return True

    @catch_hdfs_error
    def write_to_file(self, folder_name, file_name, data, overwrite=False,
                      append=False):
        """
        Writes the provided data into a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file the data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one
        :type append: bool
        :return: returns true if the data was written successfully, otherwise
            false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        # Appending to a file that does not exist yet would fail, so create
        # it instead in that case.
        if append and not self.path_exists(path):
            self.client.write(path, data, encoding='utf-8',
                              overwrite=overwrite)
        else:
            self.client.write(path, data, encoding='utf-8',
                              overwrite=overwrite, append=append)
        self.print("Written data to HDFS file")

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file the data should be read from
        :type file_name: unicode
        :return: a reader for the file, to be used as a context manager, or
            None if the file does not exist
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exist: {path}')
            return None
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: returns true if the file was deleted successfully, otherwise
            false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder

        :param folder_name: name of the folder to be deleted
        :type folder_name: unicode
        :return: returns true if the folder was deleted successfully,
            otherwise false
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder, printing every file's block size,
        length, and owner

        :param folder_name: name of the folder to be explored
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exist: {folder_name}')
            return
        self.print(f'Exploring folder: {folder_name}')
        # With `status=True`, `walk` yields `(name, status)` pairs for files.
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for file in files:
                block_size = file[1]['blockSize']
                size = file[1]['length']
                owner = file[1]['owner']
                self.print(
                    f'\tFile: {file[0]}, blockSize: {block_size}, '
                    f'size: {size}, owner: {owner}'
                )
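A short end-to-end sketch of `HadoopWebExplorer`; the folder and file names are illustrative only:

explorer = HadoopWebExplorer(debug=True)
if explorer.create_folder('experiments'):
    explorer.write_to_file('experiments', 'run-1.txt', u'loss=0.42\n',
                           overwrite=True)
    # `read_from_file` returns the reader from `client.read`, so it has to
    # be entered as a context manager.
    with explorer.read_from_file('experiments', 'run-1.txt') as reader:
        print(reader.read())
    explorer.explore_folder('experiments')
    explorer.delete_folder('experiments')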
from hdfs import Config

client = Config().get_client('dev')

# Write a small string to `a/p`, replacing the file if it already exists.
client.write('a/p', u'aaa', overwrite=True)
print(client.status('a'))
from math import ceil
from sys import argv

from hdfs import Config

script, filename = argv

client = Config().get_client()
status = client.status(filename)

# A file spans ceil(length / blockSize) HDFS blocks.
print(ceil(status['length'] / status['blockSize']))
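For very large files it is safer to stay in integer arithmetic than to round a float quotient; a sketch of the same block count as a reusable helper (the function name is an assumption):

def block_count(client, path):
    # Ceiling division on integers: -(-a // b) == ceil(a / b).
    status = client.status(path)
    return -(-status['length'] // status['blockSize'])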