def main():
    client = Config(path=hdfscliconf).get_client()
    with client.read('/user/orenault/passwd') as input:
        # print(input.read())
        df = pd.read_csv(input, sep=':', header=None)
        cols = df.iloc[:, 0]
        client.write('/user/orenault/data.avro',
                     cols.to_csv(sep=':', header=True, index=False),
                     overwrite=True)
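The `hdfscliconf` variable above is assumed to hold the path of an HdfsCLI configuration file. A minimal sketch of such a file, assuming a WebHDFS endpoint on localhost and the default alias mechanism; the alias name, URL, and user are placeholders:

# Illustrative .hdfscli.cfg (INI format read by hdfs.Config); values are examples only
[global]
default.alias = dev

[dev.alias]
url = http://localhost:50070
user = hdfs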
def main():
    arg = parsing_options()
    client = Config().get_client()
    with client.read(arg.input) as inputFile:
        # Load the file into a dataframe
        df = pd.read_csv(inputFile, sep=arg.delimiter, header=arg.header)

    # Open the output file
    with client.write(arg.output, overwrite=arg.overwrite) as outputFile:
        # Flatten the list of columns
        column = list(itertools.chain.from_iterable(arg.column))
        # Open the RSA key
        key = get_key(arg.RSAkey, arg.operation)
        # Extract the columns which need to be hashed / encrypted
        cols = df.iloc[:, column]
        colName = cols.columns
        if arg.operation == 'decrypt':
            # Do not forget the comma behind the RSA key: the correct Python
            # grammar for a singleton tuple is (key,), not (key), which is
            # just a parenthesized expression.
            df[colName] = df[colName].apply(decrypt, args=(key,), axis=1)
            df.to_csv(outputFile, sep=":", header=True, index=False)
        else:
            # Encrypt then hash - otherwise we would encrypt the hash value.
            # Call the encrypt function with the RSA key.
            encrypted = df[colName].apply(encrypt, args=(key,))
            # Rename headers so they do not clash when merging df + encrypted dataframe
            new_column = []
            for i in colName:
                new_column.append(str(i) + '_ENC')
            encrypted.columns = new_column
            # Concatenate both dataframes
            df = pd.concat([df, encrypted], axis=1)
            # Generate a hash
            df[colName] = df[colName].apply(hash_value).values
            # Write to file
            df.to_csv(outputFile, sep=":", header=True, index=False)
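`parsing_options()` is not shown in this snippet. A minimal sketch of what it might look like with argparse, keeping only the attribute names the script actually uses (`input`, `output`, `delimiter`, `header`, `column`, `RSAkey`, `operation`, `overwrite`); the flag spellings and defaults are assumptions:

import argparse

def parsing_options():
    # Hypothetical sketch: flag names and defaults are illustrative only.
    parser = argparse.ArgumentParser(
        description='Hash / encrypt selected columns of a delimited file on HDFS')
    parser.add_argument('-i', '--input', required=True, help='input path on HDFS')
    parser.add_argument('-o', '--output', required=True, help='output path on HDFS')
    parser.add_argument('-d', '--delimiter', default=':', help='field delimiter')
    parser.add_argument('--header', type=int, default=0, help='header row index for pandas')
    parser.add_argument('-c', '--column', type=int, nargs='+', action='append',
                        help='column indexes to hash / encrypt (may be repeated)')
    parser.add_argument('-k', '--RSAkey', required=True, help='path to the RSA key')
    parser.add_argument('--operation', choices=['encrypt', 'decrypt'], default='encrypt')
    parser.add_argument('--overwrite', action='store_true', help='overwrite the output file')
    return parser.parse_args()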
class PendingWindow(object):
    """docstring for PendingWindow"""

    def __init__(self, backup_dir, node):
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')
        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by its ending version, so the one currently
        # being written is named temporarily
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that the last truncation was conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for the initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir, 'latest_version')
        # special case for the initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary."""
        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_), append=True)

        if isinstance(tuple_, BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuple_.version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuple_.version), overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def extend(self, tuples):
        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path, append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        if isinstance(tuples[-1], BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuples[-1].version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuples[-1].version), overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def truncate(self, version):
        """Delete files with filename <= version."""
        # with self.hdfs_client.read(self.safe_version_path) as f:
        #     safe_version = int(f.read())
        #
        # # only the == condition can occur
        # if version <= safe_version:
        #     return
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        # self.node.LOGGER.info('truncated version %d' % version)

    def handle_version_ack(self, version_ack):
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version), overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including the current file)."""
        if version is None:
            self.hdfs_client.write(self.current_backup_path, data='', overwrite=True)
            return

        # TODO: underflow
        # assert version == 0 or ...
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self.hdfs_client.write(self.current_backup_path, data='', overwrite=True)
        self.hdfs_client.write(self.latest_version_path, data=str(version), overwrite=True)

    def replay(self):
        """When both the node and the pending window state are ready,
        replay the pending window before resuming.
        """
        for v in sorted(map(int, filter(unicode.isdigit,
                                        self.hdfs_client.list(self.backup_dir)))):
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir, str(v))) as f:
                while True:
                    try:
                        t = pickle.load(f)
                        tuples.append(t)
                    except EOFError:
                        self.node.LOGGER.debug('reached EOF, send this version')
                        break
                    # Spout needs the version too, so that the data source can
                    # resend from a version
                    # except pickle.UnpickleableError:
                    #     self.node.LOGGER.debug('spout reached partial dump location, '
                    #                            'send this incomplete version')
                    #     break
            self.node.multicast(self.node.downstream_nodes, tuples)
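A rough usage sketch of the class above, assuming a `node` object with the attributes the constructor expects and `BarrierTuple` / version-ack objects from the surrounding streaming framework; every name in this sketch is an assumption, not part of the original code:

# Hypothetical usage sketch; `node`, `data_tuples`, `barrier_tuple` and
# `version_ack` come from the surrounding framework and are assumptions.
pw = PendingWindow(backup_dir='/backup/node-1', node=node)

# Persist ordinary output tuples, then a BarrierTuple to seal that version.
pw.extend(data_tuples)
pw.append(barrier_tuple)        # renames 'current' to '<version>' on HDFS

# Acks from downstream advance the safe version and trigger truncation.
pw.handle_version_ack(version_ack)

# After a failure, roll back to the latest integral version and replay it.
pw.rewind(pw.get_latest_version())
pw.replay()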
# Get the default alias' client. (See the quickstart section in the
# documentation to learn more about this.)
client = Config().get_client()

# Some fake data that we are interested in uploading to HDFS.
model = {
    '(intercept)': 48.,
    'first_feature': 2.,
    'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
with client.write('models/1.csv', encoding='utf-8') as writer:
    for item in model.items():
        writer.write(u'%s,%s\n' % item)

# We can also serialize it to JSON and directly upload it.
with client.write('models/1.json', encoding='utf-8') as writer:
    dump(model, writer)

# We can check that the files exist and get their properties.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Later, we can download the files back. The `delimiter` option makes it
# convenient to read CSV files.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
    items = list(reader)
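The JSON file written above can be read back the same way. A small follow-up sketch, assuming `load` comes from the standard-library `json` module (the `dump` call above implies it is already in scope):

from json import load

# Deserialize the JSON model straight from HDFS.
with client.read('models/1.json', encoding='utf-8') as reader:
    model = load(reader)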
from hdfs import Config, InsecureClient
import cPickle as pickle
from tuple import Tuple

client = Config().get_client('dev')
client.write('a/p', 'aaa', overwrite=True)
print client.status('a')
class HadoopWebExplorer:
    def __init__(self, debug=False):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.hdfscli.cfg')
        self.client = Config(path).get_client()
        self.debug = debug

    def print(self, *args):
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether such a path already exists

        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether the path already exists or not
        :rtype: bool
        """
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates a folder with the given name if it does not exist

        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: returns true if it created the folder or it already exists, otherwise false
        :rtype: bool
        """
        if self.path_exists(folder_name):
            print(f'Folder already exists: {folder_name}')
            return True
        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')

    @catch_hdfs_error
    def write_to_file(self, folder_name, file_name, data, overwrite=False, append=False):
        """
        Writes the provided data into a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file the data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one
        :type append: bool
        :return: returns true if it successfully wrote the data, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        if append and not self.path_exists(path):
            self.client.write(path, data, encoding='utf-8', overwrite=overwrite)
        else:
            self.client.write(path, data, encoding='utf-8', overwrite=overwrite, append=append)
        self.print("Written data to HDFS file")

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file the data should be read from
        :type file_name: unicode
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exist: {path}')
            return None
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: returns true if it successfully deleted the file, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder

        :param folder_name: name of the folder to be deleted
        :type folder_name: unicode
        :return: returns true if it successfully deleted the folder, otherwise false
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder

        :param folder_name: name of the folder to be observed
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exist: {folder_name}')
            return
        self.print(f'Exploring folder: {folder_name}')
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for file in files:
                block_size = file[1]['blockSize']
                size = file[1]['length']
                owner = file[1]['owner']
                self.print(
                    f'\tFile: {file[0]}, blockSize: {block_size}, size: {size}, owner: {owner}'
                )
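A possible usage sketch for the explorer above; the folder and file names are placeholders, and `catch_hdfs_error` is assumed to be an error-handling decorator defined elsewhere in the project:

# Hypothetical usage; paths are placeholders.
explorer = HadoopWebExplorer(debug=True)

explorer.create_folder('demo')
explorer.write_to_file('demo', 'notes.txt', u'first line\n')
explorer.write_to_file('demo', 'notes.txt', u'second line\n', append=True)
explorer.explore_folder('demo')   # prints block size, length and owner per file
explorer.delete_file('demo', 'notes.txt')
explorer.delete_folder('demo')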
class HomuraFS():
    def __init__(self):
        self.client = Config().get_client('dev')
        self.prompt = 'homura_fs $ '
        self.name = None
        self.local_xml = None
        self.hdfs_xml = '.last_sync.xml'
        self.hdfs_loc_xml = None
        self.mount_root = None  # os.getcwd() + '/test'
        self.hdfs_root = '/cs219'
        self.meta = HomuraMeta()
        self.monitor = None
        if sys.platform.startswith('darwin'):
            logging.basicConfig(filename='mylog.log', level=logging.INFO)
            self.monitor = Monitor_Start()

    def shell_loop(self):
        while True:
            cmd = raw_input(self.prompt)
            if cmd == 'sync':
                print "Current devices attached:"
                id_mapping = dict()
                count = 1
                if len(self.monitor.devs) == 0:
                    print "No device attached"
                    continue
                for dev in self.monitor.devs:
                    #print dev
                    devname = dev['Dname']
                    manufacture = dev['Man']
                    hname = dev['Hname']
                    id_mapping[count] = dev
                    print "{}) Dname: {}, Hname: {}, Manufacture: {}.\n".format(
                        count, devname, hname, manufacture)
                    count += 1
                dev_id = int(raw_input("Which device to sync:\n"))
                if dev_id == 0:
                    continue
                if dev_id in id_mapping:
                    #self.name = id_mapping[dev_id]['UID']
                    self.name = ''
                    self.mount_root = id_mapping[dev_id]['Path']
                    self.local_xml = self.mount_root + '/.last_sync.xml'
                    self.hdfs_loc_xml = self.mount_root + '/.cur_hdfs.xml'
                    self.meta.myRootpath = self.mount_root
                    log('Mount root is ' + self.mount_root)
                    log('Device xml file is ' + self.local_xml)
                    log('HDFS xml file is ' + self.hdfs_xml)
                    log('Copy of HDFS xml stored at ' + self.hdfs_loc_xml)
                    log('Syncing files for device ' + id_mapping[dev_id]['Dname'])
                    self.sync_files()
                else:
                    pass
            elif cmd == 'test':
                pass
                #log('Setting up test directory with default config')
                #self.__test()
            elif cmd == 'download':
                pass
            elif cmd == 'quit':
                if self.monitor:
                    Monitor_Stop(self.monitor)
                return

    def download_all(self):
        log('Downloading all files from HDFS to local device')
        try:
            self.create_file(self.mount_root, self.hdfs_root, 1)
            for dir_or_file in os.listdir(self.mount_root + self.hdfs_root):
                if not dir_or_file.startswith('.'):
                    shutil.move(
                        self.mount_root + self.hdfs_root + '/' + dir_or_file,
                        self.mount_root)
            shutil.rmtree(self.mount_root + self.hdfs_root)
        except:
            log('Something went wrong while downloading files')
            try:
                shutil.rmtree(self.mount_root + self.hdfs_root)
            except:
                pass
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')

    def upload_all(self):
        log('Uploading all files from local device to HDFS')
        for dir_or_file in os.listdir(self.mount_root):
            if not dir_or_file.startswith('.'):
                try:
                    log('Uploading to ' + self.hdfs_root + '/' + dir_or_file)
                    self.client.upload(self.hdfs_root + '/' + dir_or_file,
                                       self.mount_root + '/' + dir_or_file,
                                       n_threads=0)
                except:
                    log('Warning: could not upload')

    def load_HDFS_XML(self):
        log("Attempting to fetch HDFS xml")
        self.update_file(self.hdfs_loc_xml, self.hdfs_xml, 1)
        log("Loading HDFS xml")
        self.meta.loadHDFSXml(self.hdfs_loc_xml)
        os.remove(self.hdfs_loc_xml)

    def sync_files(self):
        # check if we have an old snapshot xml
        if not os.path.isfile(self.local_xml):
            # snapshot doesn't exist, so download everything
            log("No local snapshot file was found at " + self.local_xml)
            self.meta.Snapshotdoc = self.meta.emptyXml()  # use empty
            try:
                # fetch HDFS xml and store locally
                self.load_HDFS_XML()
            except:
                self.meta.HDFSdoc = self.meta.emptyXml()
        else:
            log("Fetching local snapshot xml from " + self.local_xml)
            self.meta.loadSnapshotXml(self.local_xml)
            try:
                # fetch HDFS xml and store locally
                self.load_HDFS_XML()
            except:
                self.meta.HDFSdoc = self.meta.emptyXml()

        self.meta.path2Xml(self.mount_root)
        self.meta.mydoc = self.meta.tempdoc

        #print 'HDFS XML:'
        #self.meta.showHDFSXml()
        #print '---\nSnapshot Xml'
        #self.meta.showSnapshotXml()
        #print '---\nLocal Xml'
        #self.meta.showMyXml()

        # find operations since last sync
        (my_creates, my_deletes, my_modifies,
         hdfs_creates, hdfs_deletes, hdfs_modifies) = self.meta.getOperations()

        root = self.mount_root
        name = self.hdfs_root

        # apply operations on current device
        for path in my_creates:
            if path.endswith('/'):  # path is a folder we want to create
                os.makedirs(root + path)
            else:
                self.create_file(root + path, name + path, 1)
        for path in my_modifies:
            self.update_file(root + path, name + path, 1)
        for path in my_deletes:
            self.delete_file(root + path, 1)

        # apply operations on HDFS
        for path in hdfs_creates:
            if path.endswith('/'):  # path is a folder we want to create
                self.client.makedirs(name + path)
            else:
                self.create_file(root + path, name + path, 0)
        for path in hdfs_modifies:
            self.update_file(root + path, name + path, 0)
        for path in hdfs_deletes:
            self.delete_file(name + path, 0)

        # update last sync for both HDFS and current device
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')
        self.update_file(self.local_xml, self.hdfs_xml, 0)
        return

    # in this set of functions, when kyuubey = 0, the operation goes
    # from loc to hdfs (i.e. local becomes the "master");
    # when kyuubey = 1, the operation goes from hdfs to loc
    # (i.e. hdfs becomes the "master")
    def create_file(self, loc_path, hdfs_path, kyuubey):
        if kyuubey == 0:
            log('Creating ' + hdfs_path + ' on HDFS')
            self.client.upload(hdfs_path, loc_path, n_threads=0)
        elif kyuubey == 1:
            log('Creating ' + loc_path + ' locally')
            self.client.download(hdfs_path, loc_path, n_threads=0)

    def update_file(self, loc_path, hdfs_path, kyuubey):
        if kyuubey == 0:  # updating file on HDFS
            log('Updating file ' + hdfs_path + ' on HDFS')
            with open(loc_path) as reader:
                with self.client.write(hdfs_path, overwrite=True) as writer:
                    for line in reader:
                        writer.write(line)
        elif kyuubey == 1:
            log('Updating file ' + loc_path + ' locally')
            with open(loc_path, 'w') as writer:
                with self.client.read(hdfs_path) as reader:
                    data = reader.read()
                    writer.write(data)

    def delete_file(self, path, kyuubey):
        if kyuubey == 0:  # delete file on HDFS
            log('Deleting file ' + path + ' from HDFS')
            self.client.delete(path, recursive=True)
        elif kyuubey == 1:  # delete file locally
            log('Deleting file ' + path + ' locally')
            os.remove(path)

    def move_file(self, src_path, dst_path, kyuubey):
        if kyuubey == 0:  # move file on HDFS
            log('Moving file from ' + src_path + ' to ' + dst_path + ' on HDFS')
            self.client.rename(src_path, dst_path)
        elif kyuubey == 1:  # move file locally
            os.rename(src_path, dst_path)
            log('Moving file from ' + src_path + ' to ' + dst_path + ' locally')

    def __test(self, test_no=1):
        self.__reset_test()
        if test_no == 1:
            self.__config_basic()
        elif test_no == 2:
            self.__config_outer_empty()

    def __reset_test(self):
        root = self.mount_root
        log('Resetting mount directory')
        if os.path.exists(root):
            shutil.rmtree(root)
        os.makedirs(root)

    def __config_basic(self):
        root = self.mount_root
        log('Config 1: default')
        with open(root + '/test1.txt', 'w') as writer:
            writer.write('hi\nthere\n!\n')
        with open(root + '/test2.txt', 'w') as writer:
            writer.write('one-liner')
        with open(root + '/test3.txt', 'w') as writer:
            writer.write('')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')

    def __config_outer_empty(self):
        root = self.mount_root
        log('Config 2: outer directory empty')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')
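The class above is never instantiated in this snippet; a minimal entry point, assuming the module is meant to be launched directly (the original file may define its own):

# Hypothetical entry point; the original module may differ.
if __name__ == '__main__':
    fs = HomuraFS()
    fs.shell_loop()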
def main():
    arg = parsing_options()
    krb_client = Config(path=arg.hdfsConf).get_client()
    az_conf = read_conf(arg.azureConf)
    az_client = az_key_vault_connection(az_conf['azure_client_id'],
                                        az_conf['azure_client_secret'],
                                        az_conf['azure_tenant_id'])
    az_rsa_key = az_get_rsa_key_info(az_client, az_conf['key_vault'],
                                     az_conf['key_name'])
    column = list(itertools.chain.from_iterable(arg.column))

    with krb_client.read(arg.input) as inputFile:
        with krb_client.write(arg.output, overwrite=arg.overwrite) as outputFile:
            if arg.operation == 'encrypt':
                aes_key = generate_aes_key()
                az_conf['uuid'] = str(uuid.uuid4())
                encrypt_and_store_aes_key(az_client, az_conf,
                                          az_rsa_key['version'],
                                          base64.b64encode(aes_key))
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=10000)
                num_chunk = 0
                for chunk in df:
                    # Generate the new column names and hash in place
                    new_column = []
                    for i in column:
                        new_column.append(str(i) + '_HASH')
                    chunk[new_column] = chunk[column].apply(hash_value)
                    # Encrypt in place
                    chunk[column] = chunk[column].apply(
                        encrypt, args=(aes_key, az_conf['uuid']))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
            else:
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=1000)
                num_chunk = 0
                for chunk in df:
                    if num_chunk == 0:
                        # Split only the first column: grab the 3rd field (the key)
                        # from the first row's value
                        key = base64.b64decode(chunk[column[0]].str.split(
                            pat='-', n=3, expand=True)[3][0])
                        aes_key = retrieve_and_decrypt_aes_key(
                            az_client, az_conf, az_rsa_key['version'], key)
                    chunk[column] = chunk[column].apply(decrypt, args=(aes_key,))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
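The helpers `generate_aes_key` and `hash_value` are defined elsewhere and not shown here. A hedged sketch of plausible implementations using `os.urandom` and `hashlib`; the 256-bit key size and the SHA-256 choice are assumptions, not taken from the original code:

import os
import hashlib

def generate_aes_key(size=32):
    # Hypothetical sketch: a random 256-bit key; the real helper may differ.
    return os.urandom(size)

def hash_value(series):
    # Hypothetical sketch: SHA-256 each value of a pandas Series, hex-encoded,
    # matching the column-wise way it is applied above.
    return series.apply(lambda v: hashlib.sha256(str(v).encode('utf-8')).hexdigest())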