Example #1
0
def main():
    """Extract the first field of an HDFS passwd file and store it on HDFS.

    Reads ``/user/orenault/passwd`` as a colon-separated table without a
    header row, keeps only the first column, and writes that column as CSV
    text to ``/user/orenault/data.avro``, replacing any existing file.
    """
    hdfs = Config(path=hdfscliconf).get_client()
    with hdfs.read('/user/orenault/passwd') as reader:
        table = pd.read_csv(reader, sep=':', header=None)
        first_field = table.iloc[:, 0]
        # Without a target path, to_csv returns the CSV text, which is then
        # handed to the HDFS client as the data to upload.
        payload = first_field.to_csv(sep=":", header=True, index=False)
        hdfs.write('/user/orenault/data.avro', payload, overwrite=True)
def main():
  """Encrypt-and-hash or decrypt selected columns of a delimited HDFS file.

  The command-line options (from ``parsing_options``) provide the input and
  output HDFS paths, the delimiter/header settings for reading, the column
  positions to process, the RSA key location and the operation.

  * ``decrypt``: the selected columns are replaced by ``decrypt`` applied
    row-wise with the key.
  * any other operation: the selected columns are encrypted into new
    ``<name>_ENC`` columns, then the original columns are replaced by their
    hash.

  The whole transformation runs before the output file is opened, so a
  failure while loading the key or transforming the data does not leave a
  truncated or empty output file behind.
  """
  arg = parsing_options()
  client = Config().get_client()

  # Load file in dataframe; the with-block closes the reader on exit
  with client.read(arg.input) as inputFile:
    df = pd.read_csv(inputFile, sep=arg.delimiter, header=arg.header)

  # Flatten the list of columns
  column = list(itertools.chain.from_iterable(arg.column))
  # open RSA key
  key = get_key(arg.RSAkey, arg.operation)

  # Labels of the columns which need to be hashed / encrypted
  colName = df.iloc[:, column].columns

  if arg.operation == 'decrypt':
    # args must be a tuple: (key,) is a one-element tuple, (key) is not
    df[colName] = df[colName].apply(decrypt, args=(key,), axis=1)
  else:
    # Encrypt then hash - as otherwise we encrypt the hash value
    encrypted = df[colName].apply(encrypt, args=(key,))

    # Rename header to not clash when merging df + encrypted data frame
    encrypted.columns = [str(i) + '_ENC' for i in colName]

    # Concatenate both dataframe
    df = pd.concat([df, encrypted], axis=1)

    # Generate a hash
    df[colName] = df[colName].apply(hash_value).values

  # NOTE(review): the output always uses ':' as separator, even when
  # arg.delimiter differs - confirm this is intended.
  with client.write(arg.output, overwrite=arg.overwrite) as outputFile:
    df.to_csv(outputFile, sep=":", header=True, index=False)
Example #3
0
class PendingWindow(object):
    """Keep a node's output tuples on HDFS, grouped into numbered versions.

    Tuples are pickled into a ``current`` file inside ``backup_dir``. When a
    ``BarrierTuple`` is stored, the file is renamed to the barrier's version
    number, ``latest_version`` is updated and a new empty ``current`` file is
    started. ``safe_version`` holds the version the last truncation was
    conducted against.
    """

    def __init__(self, backup_dir, node):
        """Create the backup directory and its bookkeeping files on HDFS.

        :param backup_dir: HDFS directory holding this window's files
        :param node: owning node; its ``type``, ``downstream_connectors``,
            ``downstream_nodes``, ``multicast`` and ``LOGGER`` are used
        """
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')

        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by the ending version, so the current
        # writing one is named temporarily
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that last truncation conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir,
                                                'latest_version')
        # special case for initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        # Sink nodes get no ``version_acks``; every other node starts each
        # downstream connector at version 0.
        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def _complete_version(self, version):
        """Seal the current backup file as ``version`` and start a new one."""
        self.hdfs_client.rename(
            self.current_backup_path,
            os.path.join(self.backup_dir, str(version)))
        self.hdfs_client.write(self.latest_version_path,
                               data=str(version),
                               overwrite=True)
        # touch a fresh file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

    def _reset_current(self):
        """Replace the current backup file with an empty one."""
        self.hdfs_client.write(self.current_backup_path,
                               data='',
                               overwrite=True)

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary

        :param tuple_: tuple to pickle and append; a ``BarrierTuple`` also
            seals the current file under ``tuple_.version``
        """
        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_),
                               append=True)

        if isinstance(tuple_, BarrierTuple):
            self._complete_version(tuple_.version)

    def extend(self, tuples):
        """Persist a batch of tuples; seal a version if the last is a barrier.

        :param tuples: sequence of tuples; an empty sequence is a no-op
        """
        # Nothing to persist, and ``tuples[-1]`` below would raise.
        if not tuples:
            return

        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path,
                                    append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        last = tuples[-1]
        if isinstance(last, BarrierTuple):
            self._complete_version(last.version)

    def truncate(self, version):
        """Delete files with filename <= version
        """
        for f in self.hdfs_client.list(self.backup_dir):
            # only the numerically named files are completed versions
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

    def handle_version_ack(self, version_ack):
        """Record a downstream ack and truncate once the safe version grows.

        The safe version is the minimum acked version over all downstream
        connectors. ``version_acks`` only exists for non-sink nodes.

        :param version_ack: object with ``sent_from`` and ``version``
        """
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version),
                                   overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        """Return the integer stored in the ``latest_version`` file."""
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including current file)

        With ``version=None`` only the current file is emptied; otherwise
        ``latest_version`` is also set to ``version``.
        """
        if version is None:
            self._reset_current()
            return

        # TODO: underflow
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self._reset_current()

        self.hdfs_client.write(self.latest_version_path,
                               data=str(version),
                               overwrite=True)

    def replay(self):
        """When both the node and pending window state are ready, replay the pending window before resuming

        Completed versions are sent downstream in ascending numeric order,
        one ``multicast`` call per version.
        """
        listing = self.hdfs_client.list(self.backup_dir)
        # Same filter as truncate/rewind: ``name.isdigit()`` works for both
        # byte and unicode strings, unlike ``unicode.isdigit``.
        versions = sorted(int(name) for name in listing if name.isdigit())

        for v in versions:
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir,
                                                    str(v))) as f:
                # NOTE(security): pickle.load can execute arbitrary code;
                # only replay backup files written by a trusted process.
                while True:
                    try:
                        tuples.append(pickle.load(f))
                    except EOFError:
                        self.node.LOGGER.debug(
                            'reached EOF, send this version')
                        break
                self.node.multicast(self.node.downstream_nodes, tuples)
Example #4
0
# Walk-through of basic HDFS operations with an HdfsCLI client: delete a
# folder, upload a small model as CSV and as JSON, inspect the uploaded files
# and start reading one of them back.

# Get the default alias' client. (See the quickstart section in the
# documentation to learn more about this.)
client = Config().get_client()

# Some fake data that we are interested in uploading to HDFS.
# Maps a feature name to its (float) value.
model = {
  '(intercept)': 48.,
  'first_feature': 2.,
  'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
# Each (name, value) pair of the dict becomes one `name,value` line.
with client.write('models/1.csv', encoding='utf-8') as writer:
  for item in model.items():
    writer.write(u'%s,%s\n' % item)

# We can also serialize it to JSON and directly upload it.
# NOTE(review): `dump` is presumably `json.dump`; its import is not visible
# in this snippet.
with client.write('models/1.json', encoding='utf-8') as writer:
  dump(model, writer)

# We can check that the files exist and get their properties.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Later, we can download the files back. The `delimiter` option makes it
# convenient to read CSV files.
# NOTE(review): the body of this `with` block is missing from the visible
# snippet.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
Example #5
0
from hdfs import Config, InsecureClient
import cPickle as pickle
from tuple import Tuple

# Smoke test against the 'dev' alias: store a small file under `a/` (replacing
# any previous one) and print the status of the parent folder.
client = Config().get_client('dev')
client.write('a/p', data='aaa', overwrite=True)
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(client.status('a'))
Example #6
0
class HadoopWebExplorer:
    """Small helper around an HdfsCLI client for folder and file operations.

    The client is built from a ``.hdfscli.cfg`` file located next to this
    module. Progress messages go through :meth:`print` and are only shown
    when ``debug`` is true.

    Most methods are wrapped by ``catch_hdfs_error``; what they return when
    an HDFS error occurs depends on that decorator, which is not visible
    here.
    """

    def __init__(self, debug=False):
        """
        Loads the HDFS client configuration stored next to this module
        :param debug: whether progress messages should be printed
        :type debug: bool
        """
        config_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), '.hdfscli.cfg')
        self.client = Config(config_path).get_client()
        self.debug = debug

    def print(self, *args):
        """
        Prints the given values, but only when debug output is enabled
        """
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether such path already exists
        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether path already exists or not
        :rtype: bool
        """
        # With strict=False a missing path yields None instead of an error.
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates folder with the given name if it does not exist
        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: returns true if created folder or it already exists
        :rtype: bool
        """
        if self.path_exists(folder_name):
            # Routed through self.print so the debug switch is honoured,
            # like every other message of this class.
            self.print(f'Folder already exists: {folder_name}')
            return True

        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')
        # Report success explicitly, as documented, instead of returning None.
        return True

    @catch_hdfs_error
    def write_to_file(self,
                      folder_name,
                      file_name,
                      data,
                      overwrite=False,
                      append=False):
        """
        Writes provided data into file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one.
        :type append: bool
        :return: returns true if it successfully wrote the data
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        # When appending is requested but the file does not exist yet, fall
        # back to a plain (non-append) write that creates it.
        do_append = bool(append) and self.path_exists(path)
        self.client.write(path,
                          data,
                          encoding='utf-8',
                          overwrite=overwrite,
                          append=do_append)
        self.print("Written data to HDFS file")
        # Report success explicitly, as documented, instead of returning None.
        return True

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be read from
        :type file_name: unicode
        :return: whatever ``client.read`` returns for the path, or None when
                 the file does not exist
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exists: {path}')
            return None
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: the result of ``client.delete`` for the file path
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder and everything inside it
        :param folder_name: name of the folder to be deleted
        :type folder_name: unicode
        :return: the result of ``client.delete`` for the folder
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder and prints, in debug mode, the block
        size, length and owner of every file found while walking it
        :param folder_name: name of the folder to be observed
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exists: {folder_name}')
            # Nothing to walk; stop here instead of walking a missing path.
            return
        self.print(f'Exploring folder: {folder_name}')
        # With status=True each entry of `files` is a (name, status) pair.
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for name, info in files:
                block_size = info['blockSize']
                size = info['length']
                owner = info['owner']
                self.print(
                    f'\tFile: {name}, blockSize: {block_size}, size: {size}, owner: {owner}'
                )
class HomuraFS():
    """Interactive shell that syncs an attached device with an HDFS folder.

    The device's mount point is mirrored against ``hdfs_root`` on HDFS.
    ``HomuraMeta`` computes which files were created, modified or deleted on
    each side since the last sync, based on xml snapshots stored on the
    device (``.last_sync.xml``) and on HDFS (``hdfs_xml``).
    """

    def __init__(self):
        """Connect to the 'dev' HDFS alias and, on macOS, start the monitor."""
        self.client = Config().get_client('dev')
        self.prompt = 'homura_fs $ '
        self.name = None
        self.local_xml = None  # snapshot xml on the device
        self.hdfs_xml = '.last_sync.xml'  # snapshot xml on HDFS
        self.hdfs_loc_xml = None  # local copy of the HDFS snapshot xml
        self.mount_root = None  # set once a device is chosen in shell_loop
        self.hdfs_root = '/cs219'
        self.meta = HomuraMeta()
        self.monitor = None
        # The device monitor is only started on macOS ('darwin').
        if sys.platform.startswith('darwin'):
            logging.basicConfig(filename='mylog.log', level=logging.INFO)
            self.monitor = Monitor_Start()

    def shell_loop(self):
        """Read commands from the prompt until 'quit' is entered.

        'sync' lets the user pick an attached device and syncs it; 'test'
        and 'download' are accepted but currently do nothing; any other
        input is ignored.
        """
        while True:
            cmd = raw_input(self.prompt)

            if cmd == 'sync':
                self._sync_selected_device()
            elif cmd == 'quit':
                if self.monitor:
                    Monitor_Stop(self.monitor)
                return

    def _sync_selected_device(self):
        """List attached devices, ask which one to sync, and sync it."""
        print("Current devices attached:")

        # Without a monitor (non-macOS platforms) there are no devices.
        devices = self.monitor.devs if self.monitor else []
        if not devices:
            print("No device attached")
            return

        id_mapping = dict()
        for count, dev in enumerate(devices, 1):
            id_mapping[count] = dev
            print("{}) Dname: {}, Hname: {}, Manufacture: {}.\n".format(
                count, dev['Dname'], dev['Hname'], dev['Man']))

        try:
            dev_id = int(raw_input("Which device to sync:\n"))
        except ValueError:
            # Keep the shell alive when the answer is not a number.
            print("Invalid device number")
            return

        # 0 and any unknown number cancel the sync.
        if dev_id not in id_mapping:
            return

        device = id_mapping[dev_id]
        self.name = ''
        self.mount_root = device['Path']
        self.local_xml = self.mount_root + '/.last_sync.xml'
        self.hdfs_loc_xml = self.mount_root + '/.cur_hdfs.xml'
        self.meta.myRootpath = self.mount_root

        log('Mount root is ' + self.mount_root)
        log('Device xml file is ' + self.local_xml)
        log('HDFS xml file is ' + self.hdfs_xml)
        log('Copy of HDFS xml stored at ' + self.hdfs_loc_xml)
        log('Syncing files for device ' + device['Dname'])
        self.sync_files()

    def download_all(self):
        """Download everything under ``hdfs_root`` into the mount root.

        The HDFS folder is first downloaded to a staging folder inside the
        mount root; its non-hidden entries are then moved up and the staging
        folder is removed. Errors are logged and the snapshot xml is saved
        regardless.
        """
        log('Downloading all files from HDFS to local device')
        staging = self.mount_root + self.hdfs_root
        try:
            self.create_file(self.mount_root, self.hdfs_root, 1)
            for dir_or_file in os.listdir(staging):
                if not dir_or_file.startswith('.'):
                    shutil.move(staging + '/' + dir_or_file, self.mount_root)
            shutil.rmtree(staging)
        except Exception as err:
            log('Something went wrong while downloading files: ' + str(err))
            # Best-effort cleanup of the staging folder.
            shutil.rmtree(staging, ignore_errors=True)

        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')

    def upload_all(self):
        """Upload every non-hidden entry of the mount root to ``hdfs_root``.

        A failing entry is logged and skipped so the others still upload.
        """
        log('Uploading all files from local device to HDFS')

        for dir_or_file in os.listdir(self.mount_root):
            if dir_or_file.startswith('.'):
                continue
            target = self.hdfs_root + '/' + dir_or_file
            try:
                log('Uploading to ' + target)
                self.client.upload(target,
                                   self.mount_root + '/' + dir_or_file,
                                   n_threads=0)
            except Exception as err:
                log('Warning: could not upload: ' + str(err))

    def load_HDFS_XML(self):
        """Fetch the HDFS snapshot xml, load it into ``meta``, drop the copy."""
        log("Attempting to fetch HDFS xml")
        self.update_file(self.hdfs_loc_xml, self.hdfs_xml, 1)
        log("Loading HDFS xml")
        self.meta.loadHDFSXml(self.hdfs_loc_xml)
        os.remove(self.hdfs_loc_xml)

    def sync_files(self):
        """Reconcile the device and HDFS, then store new snapshots on both."""
        # check if we have an old snapshot xml
        if os.path.isfile(self.local_xml):
            log("Fetching local snapshot xml from " + self.local_xml)
            self.meta.loadSnapshotXml(self.local_xml)
        else:
            # snapshot doesn't exist, so download everything
            log("No local snapshot file was found at " + self.local_xml)
            self.meta.Snapshotdoc = self.meta.emptyXml()  # use empty

        try:
            # fetch HDFS xml and store locally
            self.load_HDFS_XML()
        except Exception as err:
            # No usable HDFS snapshot: treat HDFS as empty.
            log('Could not load HDFS xml, using an empty one: ' + str(err))
            self.meta.HDFSdoc = self.meta.emptyXml()

        self.meta.path2Xml(self.mount_root)
        self.meta.mydoc = self.meta.tempdoc

        # find operations since last sync
        (my_creates, my_deletes, my_modifies, hdfs_creates, hdfs_deletes,
         hdfs_modifies) = self.meta.getOperations()

        root = self.mount_root
        name = self.hdfs_root

        # apply operations on current device
        for path in my_creates:
            if path.endswith('/'):  # path is a folder we want to create
                os.makedirs(root + path)
            else:
                self.create_file(root + path, name + path, 1)
        for path in my_modifies:
            self.update_file(root + path, name + path, 1)
        for path in my_deletes:
            self.delete_file(root + path, 1)

        # apply operations on HDFS
        for path in hdfs_creates:
            if path.endswith('/'):  # path is a folder we want to create
                self.client.makedirs(name + path)
            else:
                self.create_file(root + path, name + path, 0)
        for path in hdfs_modifies:
            self.update_file(root + path, name + path, 0)
        for path in hdfs_deletes:
            self.delete_file(name + path, 0)

        # update last sync for both HDFS and current device
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')
        self.update_file(self.local_xml, self.hdfs_xml, 0)

    # in this set of functions, when kyuubey = 0, the operation goes
    # from loc to hdfs (i.e. local becomes the "master")
    # when kyuubey = 1, the operation goes from hdfs to loc
    # (i.e. hdfs becomes the "master")
    def create_file(self, loc_path, hdfs_path, kyuubey):
        """Upload (kyuubey=0) or download (kyuubey=1) a path."""
        if kyuubey == 0:
            log('Creating ' + hdfs_path + ' on HDFS')
            self.client.upload(hdfs_path, loc_path, n_threads=0)
        elif kyuubey == 1:
            log('Creating ' + loc_path + ' locally')
            self.client.download(hdfs_path, loc_path, n_threads=0)

    def update_file(self, loc_path, hdfs_path, kyuubey):
        """Overwrite the HDFS file (kyuubey=0) or the local file (kyuubey=1)."""
        if kyuubey == 0:  # updating file on HDFS
            log('Updating file ' + hdfs_path + ' on HDFS')
            with open(loc_path) as reader:
                with self.client.write(hdfs_path, overwrite=True) as writer:
                    for line in reader:
                        writer.write(line)
        elif kyuubey == 1:
            log('Updating file ' + loc_path + ' locally')
            # Read from HDFS first: if that fails, the local file is neither
            # truncated nor created as an empty leftover.
            with self.client.read(hdfs_path) as reader:
                data = reader.read()
            with open(loc_path, 'w') as writer:
                writer.write(data)

    def delete_file(self, path, kyuubey):
        """Delete ``path`` on HDFS (kyuubey=0) or locally (kyuubey=1)."""
        if kyuubey == 0:  # delete file on HDFS
            log('Deleting file ' + path + ' from HDFS')
            self.client.delete(path, recursive=True)
        elif kyuubey == 1:  # delete file locally
            log('Deleting file ' + path + ' locally')
            # Mirror the recursive HDFS delete: os.remove cannot remove
            # directories.
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)

    def move_file(self, src_path, dst_path, kyuubey):
        """Rename ``src_path`` to ``dst_path`` on HDFS (0) or locally (1)."""
        if kyuubey == 0:  # move file on HDFS
            log('Moving file from ' + src_path + ' to ' + dst_path +
                ' on HDFS')
            self.client.rename(src_path, dst_path)
        elif kyuubey == 1:  # move file locally
            log('Moving file from ' + src_path + ' to ' + dst_path +
                ' locally')
            os.rename(src_path, dst_path)

    def __test(self, test_no=1):
        """Reset the mount directory and populate it with a test layout."""
        self.__reset_test()
        if test_no == 1:
            self.__config_basic()
        elif test_no == 2:
            self.__config_outer_empty()

    def __reset_test(self):
        """Recreate the mount directory as an empty folder."""
        root = self.mount_root
        log('Resetting mount directory')
        if os.path.exists(root):
            shutil.rmtree(root)
        os.makedirs(root)

    def __make_subdir_tree(self, root):
        """Create ``subdir`` and ``subdir/subsubdir`` with one file each."""
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')

    def __config_basic(self):
        """Test layout 1: three top-level files plus the nested folders."""
        root = self.mount_root
        log('Config 1: default')
        with open(root + '/test1.txt', 'w') as writer:
            writer.write('hi\nthere\n!\n')
        with open(root + '/test2.txt', 'w') as writer:
            writer.write('one-liner')
        with open(root + '/test3.txt', 'w') as writer:
            writer.write('')
        self.__make_subdir_tree(root)

    def __config_outer_empty(self):
        """Test layout 2: only the nested folders, no top-level files."""
        root = self.mount_root
        log('Config 2: outer directory empty')
        self.__make_subdir_tree(root)
Example #8
0
def main():
    """Encrypt-and-hash or decrypt selected columns of a delimited HDFS file.

    The input is streamed in chunks from HDFS and written back chunk by
    chunk, with the header row emitted for the first chunk only.

    * ``encrypt``: a new AES key is generated and stored through
      ``encrypt_and_store_aes_key``; each selected column gets a hashed
      ``<name>_HASH`` companion and is then encrypted in place.
    * any other operation: the AES key is recovered from the first value of
      the first selected column and the selected columns are decrypted in
      place.
    """
    arg = parsing_options()
    krb_client = Config(path=arg.hdfsConf).get_client()
    az_conf = read_conf(arg.azureConf)
    az_client = az_key_vault_connection(az_conf['azure_client_id'],
                                        az_conf['azure_client_secret'],
                                        az_conf['azure_tenant_id'])
    az_rsa_key = az_get_rsa_key_info(az_client, az_conf['key_vault'],
                                     az_conf['key_name'])
    column = list(itertools.chain.from_iterable(arg.column))

    with krb_client.read(arg.input) as inputFile, \
            krb_client.write(arg.output,
                             overwrite=arg.overwrite) as outputFile:
        if arg.operation == 'encrypt':
            aes_key = generate_aes_key()
            az_conf['uuid'] = str(uuid.uuid4())
            encrypt_and_store_aes_key(az_client, az_conf,
                                      az_rsa_key['version'],
                                      base64.b64encode(aes_key))
            chunks = pd.read_csv(inputFile,
                                 sep=arg.delimiter,
                                 header=arg.header,
                                 dtype=str,
                                 chunksize=10000)
            # Names of the columns receiving the hashed values.
            hash_columns = [str(i) + '_HASH' for i in column]
            for position, chunk in enumerate(chunks):
                # Hash into the new columns first, then encrypt in place.
                chunk[hash_columns] = chunk[column].apply(hash_value)
                chunk[column] = chunk[column].apply(
                    encrypt, args=(aes_key, az_conf['uuid']))
                # Only the first chunk carries the header row.
                chunk.to_csv(outputFile,
                             sep=arg.delimiter,
                             header=(position == 0),
                             index=False)
        else:
            chunks = pd.read_csv(inputFile,
                                 sep=arg.delimiter,
                                 header=arg.header,
                                 dtype=str,
                                 chunksize=1000)
            for position, chunk in enumerate(chunks):
                is_first = position == 0
                if is_first:
                    # Split the first selected column on '-' (at most three
                    # splits) and take piece 3 of row 0 as the encoded key.
                    pieces = chunk[column[0]].str.split(pat='-',
                                                        n=3,
                                                        expand=True)
                    key = base64.b64decode(pieces[3][0])
                    aes_key = retrieve_and_decrypt_aes_key(
                        az_client, az_conf, az_rsa_key['version'], key)
                chunk[column] = chunk[column].apply(decrypt,
                                                    args=(aes_key, ))
                # Only the first chunk carries the header row.
                chunk.to_csv(outputFile,
                             sep=arg.delimiter,
                             header=is_first,
                             index=False)