def main():
    client = Config(path=hdfscliconf).get_client()
    with client.read('/user/orenault/passwd') as input:
        # print(input.read())
        df = pd.read_csv(input, sep=':', header=None)
        cols = df.iloc[:, 0]
        client.write('/user/orenault/data.avro',
                     cols.to_csv(sep=':', header=True, index=False),
                     overwrite=True)
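The `hdfscliconf` variable above is assumed to hold the path of an HdfsCLI configuration file. A minimal sketch of such a file, assuming a WebHDFS endpoint on localhost and the default alias mechanism; the alias name, URL, and user are placeholders:

# Illustrative .hdfscli.cfg (INI format read by hdfs.Config); values are examples only
[global]
default.alias = dev

[dev.alias]
url = http://localhost:50070
user = hdfs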
def main():
    arg = parsing_options()
    client = Config().get_client()
    with client.read(arg.input) as inputFile:
        # Load the file into a dataframe
        df = pd.read_csv(inputFile, sep=arg.delimiter, header=arg.header)

    # Open the output file
    with client.write(arg.output, overwrite=arg.overwrite) as outputFile:
        # Flatten the list of columns
        column = list(itertools.chain.from_iterable(arg.column))
        # Open the RSA key
        key = get_key(arg.RSAkey, arg.operation)
        # Extract the columns which need to be hashed / encrypted
        cols = df.iloc[:, column]
        colName = cols.columns
        if arg.operation == 'decrypt':
            # Do not forget the comma behind the RSA key: the correct Python
            # grammar for a singleton tuple is (key,), not (key), which is
            # just a parenthesized expression.
            df[colName] = df[colName].apply(decrypt, args=(key,), axis=1)
            df.to_csv(outputFile, sep=":", header=True, index=False)
        else:
            # Encrypt then hash - otherwise we would encrypt the hash value.
            # Call the encrypt function with the RSA key.
            encrypted = df[colName].apply(encrypt, args=(key,))
            # Rename headers so they do not clash when merging df + encrypted dataframe
            new_column = []
            for i in colName:
                new_column.append(str(i) + '_ENC')
            encrypted.columns = new_column
            # Concatenate both dataframes
            df = pd.concat([df, encrypted], axis=1)
            # Generate a hash
            df[colName] = df[colName].apply(hash_value).values
            # Write to file
            df.to_csv(outputFile, sep=":", header=True, index=False)
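`parsing_options()` is not shown in this snippet. A minimal sketch of what it might look like with argparse, keeping only the attribute names the script actually uses (`input`, `output`, `delimiter`, `header`, `column`, `RSAkey`, `operation`, `overwrite`); the flag spellings and defaults are assumptions:

import argparse

def parsing_options():
    # Hypothetical sketch: flag names and defaults are illustrative only.
    parser = argparse.ArgumentParser(
        description='Hash / encrypt selected columns of a delimited file on HDFS')
    parser.add_argument('-i', '--input', required=True, help='input path on HDFS')
    parser.add_argument('-o', '--output', required=True, help='output path on HDFS')
    parser.add_argument('-d', '--delimiter', default=':', help='field delimiter')
    parser.add_argument('--header', type=int, default=0, help='header row index for pandas')
    parser.add_argument('-c', '--column', type=int, nargs='+', action='append',
                        help='column indexes to hash / encrypt (may be repeated)')
    parser.add_argument('-k', '--RSAkey', required=True, help='path to the RSA key')
    parser.add_argument('--operation', choices=['encrypt', 'decrypt'], default='encrypt')
    parser.add_argument('--overwrite', action='store_true', help='overwrite the output file')
    return parser.parse_args()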
class PendingWindow(object):
    """docstring for PendingWindow"""

    def __init__(self, backup_dir, node):
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')
        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by its ending version, so the one currently
        # being written is named temporarily
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that the last truncation was conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for the initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir, 'latest_version')
        # special case for the initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary."""
        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_), append=True)

        if isinstance(tuple_, BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuple_.version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuple_.version), overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def extend(self, tuples):
        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path, append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        if isinstance(tuples[-1], BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuples[-1].version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuples[-1].version), overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def truncate(self, version):
        """Delete files with filename <= version."""
        # with self.hdfs_client.read(self.safe_version_path) as f:
        #     safe_version = int(f.read())
        #
        # # only the == condition can occur
        # if version <= safe_version:
        #     return
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        # self.node.LOGGER.info('truncated version %d' % version)

    def handle_version_ack(self, version_ack):
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version), overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including the current file)."""
        if version is None:
            self.hdfs_client.write(self.current_backup_path, data='', overwrite=True)
            return

        # TODO: underflow
        # assert version == 0 or ...
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self.hdfs_client.write(self.current_backup_path, data='', overwrite=True)
        self.hdfs_client.write(self.latest_version_path, data=str(version), overwrite=True)

    def replay(self):
        """When both the node and the pending window state are ready,
        replay the pending window before resuming.
        """
        for v in sorted(map(int, filter(unicode.isdigit,
                                        self.hdfs_client.list(self.backup_dir)))):
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir, str(v))) as f:
                while True:
                    try:
                        t = pickle.load(f)
                        tuples.append(t)
                    except EOFError:
                        self.node.LOGGER.debug('reached EOF, send this version')
                        break
                    # Spout needs the version too, so that the data source can
                    # resend from a version
                    # except pickle.UnpickleableError:
                    #     self.node.LOGGER.debug('spout reached partial dump location, '
                    #                            'send this incomplete version')
                    #     break
            self.node.multicast(self.node.downstream_nodes, tuples)
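A rough usage sketch of the class above, assuming a `node` object with the attributes the constructor expects and `BarrierTuple` / version-ack objects from the surrounding streaming framework; every name in this sketch is an assumption, not part of the original code:

# Hypothetical usage sketch; `node`, `data_tuples`, `barrier_tuple` and
# `version_ack` come from the surrounding framework and are assumptions.
pw = PendingWindow(backup_dir='/backup/node-1', node=node)

# Persist ordinary output tuples, then a BarrierTuple to seal that version.
pw.extend(data_tuples)
pw.append(barrier_tuple)        # renames 'current' to '<version>' on HDFS

# Acks from downstream advance the safe version and trigger truncation.
pw.handle_version_ack(version_ack)

# After a failure, roll back to the latest integral version and replay it.
pw.rewind(pw.get_latest_version())
pw.replay()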
# Get the default alias' client. (See the quickstart section in the
# documentation to learn more about this.)
client = Config().get_client()

# Some fake data that we are interested in uploading to HDFS.
model = {
    '(intercept)': 48.,
    'first_feature': 2.,
    'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
with client.write('models/1.csv', encoding='utf-8') as writer:
    for item in model.items():
        writer.write(u'%s,%s\n' % item)

# We can also serialize it to JSON and directly upload it.
with client.write('models/1.json', encoding='utf-8') as writer:
    dump(model, writer)

# We can check that the files exist and get their properties.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Later, we can download the files back. The `delimiter` option makes it
# convenient to read CSV files.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
    items = list(reader)
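The JSON file written above can be read back the same way. A small follow-up sketch, assuming `load` comes from the standard-library `json` module (the `dump` call above implies it is already in scope):

from json import load

# Deserialize the JSON model straight from HDFS.
with client.read('models/1.json', encoding='utf-8') as reader:
    model = load(reader)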
from hdfs import Config, InsecureClient
import cPickle as pickle
from tuple import Tuple

client = Config().get_client('dev')
client.write('a/p', 'aaa', overwrite=True)
print client.status('a')
class HadoopWebExplorer:
    def __init__(self, debug=False):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.hdfscli.cfg')
        self.client = Config(path).get_client()
        self.debug = debug

    def print(self, *args):
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether such a path already exists

        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether the path already exists or not
        :rtype: bool
        """
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates a folder with the given name if it does not exist

        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: returns true if it created the folder or it already exists, otherwise false
        :rtype: bool
        """
        if self.path_exists(folder_name):
            print(f'Folder already exists: {folder_name}')
            return True
        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')

    @catch_hdfs_error
    def write_to_file(self, folder_name, file_name, data, overwrite=False, append=False):
        """
        Writes the provided data into a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file the data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one
        :type append: bool
        :return: returns true if it successfully wrote the data, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        if append and not self.path_exists(path):
            self.client.write(path, data, encoding='utf-8', overwrite=overwrite)
        else:
            self.client.write(path, data, encoding='utf-8', overwrite=overwrite, append=append)
        self.print("Written data to HDFS file")

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file the data should be read from
        :type file_name: unicode
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exist: {path}')
            return None
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes a file in the specified folder

        :param folder_name: name of the folder where the file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: returns true if it successfully deleted the file, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder

        :param folder_name: name of the folder to be deleted
        :type folder_name: unicode
        :return: returns true if it successfully deleted the folder, otherwise false
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder

        :param folder_name: name of the folder to be observed
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exist: {folder_name}')
            return
        self.print(f'Exploring folder: {folder_name}')
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for file in files:
                block_size = file[1]['blockSize']
                size = file[1]['length']
                owner = file[1]['owner']
                self.print(
                    f'\tFile: {file[0]}, blockSize: {block_size}, size: {size}, owner: {owner}'
                )
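A possible usage sketch for the explorer above; the folder and file names are placeholders, and `catch_hdfs_error` is assumed to be an error-handling decorator defined elsewhere in the project:

# Hypothetical usage; paths are placeholders.
explorer = HadoopWebExplorer(debug=True)

explorer.create_folder('demo')
explorer.write_to_file('demo', 'notes.txt', u'first line\n')
explorer.write_to_file('demo', 'notes.txt', u'second line\n', append=True)
explorer.explore_folder('demo')   # prints block size, length and owner per file
explorer.delete_file('demo', 'notes.txt')
explorer.delete_folder('demo')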
class HomuraFS():
    def __init__(self):
        self.client = Config().get_client('dev')
        self.prompt = 'homura_fs $ '
        self.name = None
        self.local_xml = None
        self.hdfs_xml = '.last_sync.xml'
        self.hdfs_loc_xml = None
        self.mount_root = None  # os.getcwd() + '/test'
        self.hdfs_root = '/cs219'
        self.meta = HomuraMeta()
        self.monitor = None
        if sys.platform.startswith('darwin'):
            logging.basicConfig(filename='mylog.log', level=logging.INFO)
            self.monitor = Monitor_Start()

    def shell_loop(self):
        while True:
            cmd = raw_input(self.prompt)
            if cmd == 'sync':
                print "Current devices attached:"
                id_mapping = dict()
                count = 1
                if len(self.monitor.devs) == 0:
                    print "No device attached"
                    continue
                for dev in self.monitor.devs:
                    #print dev
                    devname = dev['Dname']
                    manufacture = dev['Man']
                    hname = dev['Hname']
                    id_mapping[count] = dev
                    print "{}) Dname: {}, Hname: {}, Manufacture: {}.\n".format(
                        count, devname, hname, manufacture)
                    count += 1
                dev_id = int(raw_input("Which device to sync:\n"))
                if dev_id == 0:
                    continue
                if dev_id in id_mapping:
                    #self.name = id_mapping[dev_id]['UID']
                    self.name = ''
                    self.mount_root = id_mapping[dev_id]['Path']
                    self.local_xml = self.mount_root + '/.last_sync.xml'
                    self.hdfs_loc_xml = self.mount_root + '/.cur_hdfs.xml'
                    self.meta.myRootpath = self.mount_root
                    log('Mount root is ' + self.mount_root)
                    log('Device xml file is ' + self.local_xml)
                    log('HDFS xml file is ' + self.hdfs_xml)
                    log('Copy of HDFS xml stored at ' + self.hdfs_loc_xml)
                    log('Syncing files for device ' + id_mapping[dev_id]['Dname'])
                    self.sync_files()
                else:
                    pass
            elif cmd == 'test':
                pass
                #log('Setting up test directory with default config')
                #self.__test()
            elif cmd == 'download':
                pass
            elif cmd == 'quit':
                if self.monitor:
                    Monitor_Stop(self.monitor)
                return

    def download_all(self):
        log('Downloading all files from HDFS to local device')
        try:
            self.create_file(self.mount_root, self.hdfs_root, 1)
            for dir_or_file in os.listdir(self.mount_root + self.hdfs_root):
                if not dir_or_file.startswith('.'):
                    shutil.move(
                        self.mount_root + self.hdfs_root + '/' + dir_or_file,
                        self.mount_root)
            shutil.rmtree(self.mount_root + self.hdfs_root)
        except:
            log('Something went wrong while downloading files')
            try:
                shutil.rmtree(self.mount_root + self.hdfs_root)
            except:
                pass
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')

    def upload_all(self):
        log('Uploading all files from local device to HDFS')
        for dir_or_file in os.listdir(self.mount_root):
            if not dir_or_file.startswith('.'):
                try:
                    log('Uploading to ' + self.hdfs_root + '/' + dir_or_file)
                    self.client.upload(self.hdfs_root + '/' + dir_or_file,
                                       self.mount_root + '/' + dir_or_file,
                                       n_threads=0)
                except:
                    log('Warning: could not upload')

    def load_HDFS_XML(self):
        log("Attempting to fetch HDFS xml")
        self.update_file(self.hdfs_loc_xml, self.hdfs_xml, 1)
        log("Loading HDFS xml")
        self.meta.loadHDFSXml(self.hdfs_loc_xml)
        os.remove(self.hdfs_loc_xml)

    def sync_files(self):
        # check if we have an old snapshot xml
        if not os.path.isfile(self.local_xml):
            # snapshot doesn't exist, so download everything
            log("No local snapshot file was found at " + self.local_xml)
            self.meta.Snapshotdoc = self.meta.emptyXml()  # use empty
            try:
                # fetch HDFS xml and store locally
                self.load_HDFS_XML()
            except:
                self.meta.HDFSdoc = self.meta.emptyXml()
        else:
            log("Fetching local snapshot xml from " + self.local_xml)
            self.meta.loadSnapshotXml(self.local_xml)
            try:
                # fetch HDFS xml and store locally
                self.load_HDFS_XML()
            except:
                self.meta.HDFSdoc = self.meta.emptyXml()

        self.meta.path2Xml(self.mount_root)
        self.meta.mydoc = self.meta.tempdoc

        #print 'HDFS XML:'
        #self.meta.showHDFSXml()
        #print '---\nSnapshot Xml'
        #self.meta.showSnapshotXml()
        #print '---\nLocal Xml'
        #self.meta.showMyXml()

        # find operations since last sync
        (my_creates, my_deletes, my_modifies,
         hdfs_creates, hdfs_deletes, hdfs_modifies) = self.meta.getOperations()

        root = self.mount_root
        name = self.hdfs_root

        # apply operations on current device
        for path in my_creates:
            if path.endswith('/'):  # path is a folder we want to create
                os.makedirs(root + path)
            else:
                self.create_file(root + path, name + path, 1)
        for path in my_modifies:
            self.update_file(root + path, name + path, 1)
        for path in my_deletes:
            self.delete_file(root + path, 1)

        # apply operations on HDFS
        for path in hdfs_creates:
            if path.endswith('/'):  # path is a folder we want to create
                self.client.makedirs(name + path)
            else:
                self.create_file(root + path, name + path, 0)
        for path in hdfs_modifies:
            self.update_file(root + path, name + path, 0)
        for path in hdfs_deletes:
            self.delete_file(name + path, 0)

        # update last sync for both HDFS and current device
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')
        self.update_file(self.local_xml, self.hdfs_xml, 0)
        return

    # in this set of functions, when kyuubey = 0, the operation goes
    # from loc to hdfs (i.e. local becomes the "master");
    # when kyuubey = 1, the operation goes from hdfs to loc
    # (i.e. hdfs becomes the "master")
    def create_file(self, loc_path, hdfs_path, kyuubey):
        if kyuubey == 0:
            log('Creating ' + hdfs_path + ' on HDFS')
            self.client.upload(hdfs_path, loc_path, n_threads=0)
        elif kyuubey == 1:
            log('Creating ' + loc_path + ' locally')
            self.client.download(hdfs_path, loc_path, n_threads=0)

    def update_file(self, loc_path, hdfs_path, kyuubey):
        if kyuubey == 0:  # updating file on HDFS
            log('Updating file ' + hdfs_path + ' on HDFS')
            with open(loc_path) as reader:
                with self.client.write(hdfs_path, overwrite=True) as writer:
                    for line in reader:
                        writer.write(line)
        elif kyuubey == 1:
            log('Updating file ' + loc_path + ' locally')
            with open(loc_path, 'w') as writer:
                with self.client.read(hdfs_path) as reader:
                    data = reader.read()
                    writer.write(data)

    def delete_file(self, path, kyuubey):
        if kyuubey == 0:  # delete file on HDFS
            log('Deleting file ' + path + ' from HDFS')
            self.client.delete(path, recursive=True)
        elif kyuubey == 1:  # delete file locally
            log('Deleting file ' + path + ' locally')
            os.remove(path)

    def move_file(self, src_path, dst_path, kyuubey):
        if kyuubey == 0:  # move file on HDFS
            log('Moving file from ' + src_path + ' to ' + dst_path + ' on HDFS')
            self.client.rename(src_path, dst_path)
        elif kyuubey == 1:  # move file locally
            os.rename(src_path, dst_path)
            log('Moving file from ' + src_path + ' to ' + dst_path + ' locally')

    def __test(self, test_no=1):
        self.__reset_test()
        if test_no == 1:
            self.__config_basic()
        elif test_no == 2:
            self.__config_outer_empty()

    def __reset_test(self):
        root = self.mount_root
        log('Resetting mount directory')
        if os.path.exists(root):
            shutil.rmtree(root)
        os.makedirs(root)

    def __config_basic(self):
        root = self.mount_root
        log('Config 1: default')
        with open(root + '/test1.txt', 'w') as writer:
            writer.write('hi\nthere\n!\n')
        with open(root + '/test2.txt', 'w') as writer:
            writer.write('one-liner')
        with open(root + '/test3.txt', 'w') as writer:
            writer.write('')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')

    def __config_outer_empty(self):
        root = self.mount_root
        log('Config 2: outer directory empty')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')
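The class above is never instantiated in this snippet; a minimal entry point, assuming the module is meant to be launched directly (the original file may define its own):

# Hypothetical entry point; the original module may differ.
if __name__ == '__main__':
    fs = HomuraFS()
    fs.shell_loop()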
def main():
    arg = parsing_options()
    krb_client = Config(path=arg.hdfsConf).get_client()
    az_conf = read_conf(arg.azureConf)
    az_client = az_key_vault_connection(az_conf['azure_client_id'],
                                        az_conf['azure_client_secret'],
                                        az_conf['azure_tenant_id'])
    az_rsa_key = az_get_rsa_key_info(az_client, az_conf['key_vault'],
                                     az_conf['key_name'])
    column = list(itertools.chain.from_iterable(arg.column))

    with krb_client.read(arg.input) as inputFile:
        with krb_client.write(arg.output, overwrite=arg.overwrite) as outputFile:
            if arg.operation == 'encrypt':
                aes_key = generate_aes_key()
                az_conf['uuid'] = str(uuid.uuid4())
                encrypt_and_store_aes_key(az_client, az_conf,
                                          az_rsa_key['version'],
                                          base64.b64encode(aes_key))
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=10000)
                num_chunk = 0
                for chunk in df:
                    # Generate the new column names and hash in place
                    new_column = []
                    for i in column:
                        new_column.append(str(i) + '_HASH')
                    chunk[new_column] = chunk[column].apply(hash_value)
                    # Encrypt in place
                    chunk[column] = chunk[column].apply(
                        encrypt, args=(aes_key, az_conf['uuid']))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
            else:
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=1000)
                num_chunk = 0
                for chunk in df:
                    if num_chunk == 0:
                        # Split only the first column: grab the 3rd field (the key)
                        # from the first row's value
                        key = base64.b64decode(chunk[column[0]].str.split(
                            pat='-', n=3, expand=True)[3][0])
                        aes_key = retrieve_and_decrypt_aes_key(
                            az_client, az_conf, az_rsa_key['version'], key)
                    chunk[column] = chunk[column].apply(decrypt, args=(aes_key,))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
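The helpers `generate_aes_key` and `hash_value` are defined elsewhere and not shown here. A hedged sketch of plausible implementations using `os.urandom` and `hashlib`; the 256-bit key size and the SHA-256 choice are assumptions, not taken from the original code:

import os
import hashlib

def generate_aes_key(size=32):
    # Hypothetical sketch: a random 256-bit key; the real helper may differ.
    return os.urandom(size)

def hash_value(series):
    # Hypothetical sketch: SHA-256 each value of a pandas Series, hex-encoded,
    # matching the column-wise way it is applied above.
    return series.apply(lambda v: hashlib.sha256(str(v).encode('utf-8')).hexdigest())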