Example #1
import os
import sys
import pickle as pk  # pk.load below unpickles the labels
from urlparse import urlparse  # on Python 3: from urllib.parse import urlparse
from snakebite.client import Client

def do_read_labels(file_uri):
    o = urlparse(file_uri)
    t = '/tmp/image.dat.' + str(os.getpid())

    try:
        if o.scheme != 'hdfs':
            with open(o.path, 'rb') as fd:
                labels_train, labels_test = pk.load(fd)
        else:
            if os.path.exists(t):
                os.remove(t)
            client = Client(o.hostname, o.port)
            for f in client.copyToLocal([o.path], t):
                if f['result'] == True:
                    with open(t, 'rb') as fd:
                        labels_train, labels_test = pk.load(fd)
                    os.remove(t)
                else:
                    print('File ' + f['path'] + ' NOT copied because "' +
                          str(f['error']) + '", sorry !')
                    return None, None
    except:
        print('Exception ' + str(sys.exc_info()[0]) + ' on file ' + file_uri)
        return None, None

    return labels_train, labels_test
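A hypothetical call, showing the two forms of file_uri the function above accepts (the paths and namenode address are placeholders):

# Local path: opened directly via its urlparse().path.
labels_train, labels_test = do_read_labels('/data/labels.pkl')

# HDFS URI: copied to a /tmp file with snakebite, then unpickled.
labels_train, labels_test = do_read_labels('hdfs://namenode.example.com:9000/data/labels.pkl')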
Example #2
def gethdfsfile(request):

    if request.method == "GET":
        file_id = request.GET["file_id"]
        file_name = request.GET["file_name"]

        file_name = urllib.unquote(file_name).encode("utf-8")

        ### Temporary file
        tfile = "/tmp/{file_id}".format(file_id=file_id)

        client = Client('10.6.0.135', 9000)
        for x in client.copyToLocal(['/blocks/%s' % file_id], tfile):
            print x

        f = open(tfile, 'r')
        data = f.read()
        f.close()

        ### Delete the temporary file
        os.remove(tfile)

        content_type =  mimetypes.types_map[".%s" % file_name.split('.')[-1]]

        response = HttpResponse(data, content_type=content_type)
        response['Content-Disposition'] = 'attachment; filename="%s"' % file_name
        return response
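To reach gethdfsfile over HTTP it still needs an entry in the project's URLconf; a minimal sketch, assuming the view lives in a views module (the URL pattern itself is a placeholder):

# urls.py -- hypothetical routing for the view above, using Django's classic url() helper.
from django.conf.urls import url
from . import views

urlpatterns = [
    # called as /gethdfsfile/?file_id=<id>&file_name=<name>
    url(r'^gethdfsfile/$', views.gethdfsfile),
]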
Example #3
def main(argv):
    hdfs_namenode = os.environ['HDFS_NAMENODE']
    model_on_hdfs = os.environ['MODEL_ON_HDFS']
    ip, port = hdfs_namenode.rsplit(':', 1)
    client = Client(ip, int(port), use_trash=False)
    dst_dir = os.path.join('/')
    for x in client.copyToLocal([model_on_hdfs], dst_dir):
        print x

    zk_master = os.environ['ZK_MASTER']

    logger.info('job_name: {0}, task_index: {1}'.format(
        os.environ['JOB_NAME'], os.environ['TASK_INDEX']))
    logger.info('command: {0}'.format(os.environ['CMD']))

    zk = KazooClient(hosts=zk_master)
    zk.start()

    logger.info('job uid: {0}'.format(os.environ['UID']))
    job_zk_dir = '/' + os.environ['UID']

    members = zk.get_children(job_zk_dir + '/member/')
    members.sort()

    cluster_def = {}
    for member in members:
        host = zk.get(job_zk_dir + '/member/' + member)[0]
        if host != '':
            logger.info('{0} running on {1}'.format(member, host))
            job_type = member.split('_')[2]

            if job_type == 'ps':
                cluster_def.setdefault('ps', []).append(host)
            elif job_type == 'worker':
                cluster_def.setdefault('worker', []).append(host)
            else:
                logger.error('unknown type: {0}'.format(job_type))

    ps = ','.join(cluster_def['ps'])
    worker = ','.join(cluster_def['worker'])

    my_env = os.environ.copy()
    logger.info(my_env)
    my_env['PS'] = ps
    my_env['WORKER'] = worker

    cmd = [os.environ['CMD']]
    child = subprocess.Popen(cmd, shell=True, env=my_env)

    child.wait()
    zk.stop()
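main() above expects its configuration through environment variables (and assumes KazooClient, subprocess and a configured logger are imported elsewhere in the file). A hypothetical launch with placeholder values:

import os
import sys

# Placeholder values -- substitute your own cluster addresses and job metadata.
os.environ.update({
    'HDFS_NAMENODE': 'namenode.example.com:8020',  # split into host/port by rsplit(':', 1)
    'MODEL_ON_HDFS': '/models/my_model',           # HDFS path copied under '/'
    'ZK_MASTER': 'zk1.example.com:2181',           # ZooKeeper connection string
    'JOB_NAME': 'worker',
    'TASK_INDEX': '0',
    'UID': 'job-0001',                             # ZooKeeper dir that lists member hosts
    'CMD': 'python train.py',                      # command launched with PS/WORKER in its env
})

main(sys.argv)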
Example #4
def get_json_object():

    client = Client('localhost', 9000)  # merged filename is hardcoded; you have to change the filename for every analysis
    for a in client.copyToLocal(['/user/flume/tweets/merged_20210102123709.json'], '/home/manojkhatokar/Downloads/BGD/final_python_scripts/merged_data'):
        print(a)


    # with open('/home/manojkhatokar/Downloads/merged_20210101142527.json') as f:
    #     raw_data = f.read().splitlines()[-1]
    #     list_data = f'[{raw_data}]'
    #     json_data = json.loads(list_data)
    #     print(json_data)

    json_file = open('/home/manojkhatokar/Downloads/BGD/final_python_scripts/merged_data/merged_20210102123709.json')
    json_object = json.load(json_file)
    json_file.close()
    return json_object
Example #5
def getTrainedModel(hdfsServer, modelFile):
    hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
    modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + modelFile + '/'

    # Load the saved model data
    hdfs_client = Client(hdfsServer, hdfsPort)
    filesInfo = hdfs_client.ls([modelSavePath])

    # Copy HDFS files to local temp directory
    # First clean up and recreate the temp folder
    copyDir = tempfile.gettempdir() + "/" + modelFile
    shutil.rmtree(copyDir, ignore_errors=True)
    os.makedirs(copyDir)
    res = hdfs_client.copyToLocal([f['path'] for f in filesInfo], copyDir)
    for r in res:
        if not r['result']:
            print "Error: %s" % r

    modelFilePath = copyDir + '/' + modelFile
    print "Load model from  %s" % modelFilePath
    return joblib.load(modelFilePath)
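A hypothetical call to getTrainedModel, assuming HDFS_NAME_PORT and LOGNAME are set and a joblib-saved model exists under /user/<LOGNAME>/data/model/<modelFile>/ (the namenode host and model name are placeholders):

# Hypothetical usage -- adjust the namenode host and model file name for your cluster.
model = getTrainedModel('namenode.example.com', 'model.pkl')
print(model)  # e.g. the scikit-learn estimator restored by joblib.load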
Example #6
def getObjsBackend(objs, backend, config):

    if(backend == 'hdfs'):

        client = Client(socket.gethostname(), config['HADOOP_RPC_PORT'], use_trash=False)

        for obj in objs:
            try:
                copy_gen = client.copyToLocal([obj[0]], obj[1])
                for copy_item in copy_gen:
                    pass
            except Exception as e:
                print(e)
    elif(backend == 'swift'):

        options = {
            'os_auth_url': os.environ['OS_AUTH_URL'],
            'os_username': os.environ['OS_USERNAME'],
            'os_password': os.environ['OS_PASSWORD'],
            'os_tenant_id': os.environ['OS_TENANT_ID'],
            'os_tenant_name': os.environ['OS_TENANT_NAME'],
        }
        swiftService = SwiftService(options=options)

        for obj in objs:

            # Create the containers which are used in this application for Object Storage
            if(obj[0] == 'sqlite.db'):
                swiftService.post(container='containerFiles')
                swiftService.post(container='containerFeatures')
                swiftService.post(container='containerModules')

            out_file = obj[1]  # Get the output file location from runner
            localoptions = {'out_file': out_file}
            objects = []
            objects.append(obj[0])
            swiftDownload = swiftService.download(container='containerModules', objects=objects, options=localoptions)

            for downloaded in swiftDownload:
                if("error" in downloaded.keys()):
                    raise RuntimeError(downloaded['error'])
                # print(downloaded)

    elif(backend == 'nfs'):  # Every file is already in respective local dirs
        pass
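getObjsBackend takes objs as (source, destination) pairs plus a config dict that, for the 'hdfs' backend, supplies the namenode RPC port. A hypothetical call (the paths and port are placeholders, not from the original project):

objs = [
    ('/modules/feature_extractor.py', '/tmp/feature_extractor.py'),
    ('/modules/sqlite.db', '/tmp/sqlite.db'),
]
config = {'HADOOP_RPC_PORT': 9000}
getObjsBackend(objs, 'hdfs', config)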
Example #7
from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/input/input.txt'], '/tmp'):
    print f
Example #8
class DataGenerator(object):

    'Generates data for Keras'
    '''
    Initialization function of the class
    '''
    def __init__(self,
                 height=28,
                 width=28,
                 channels=1,
                 batch_size=32,
                 cache_mode='',
                 images_uri='/',
                 shuffle=True):
        'Initialization'
        self.debug = False
        self.height = height
        self.width = width
        self.channels = channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.cache_mode = cache_mode
        self.images_uri = images_uri
        o = urlparse(self.images_uri)
        if o.scheme == 'hdfs':
            self.images_path = o.path
            self.client = Client(
                o.hostname, o.port
            )  # images_uri: 'hdfs://10.0.40.19:9600/daloflow/dataset32x32/'
        else:
            self.images_path = images_uri
            self.client = None

    '''
    Set debug mode True/False
    '''

    def set_debug(self, debug_mode):
        'Do not show or show messages'
        self.debug = debug_mode
        if self.debug:
            print(' * Debug mode:  ' + str(self.debug))
            print(' * Height:      ' + str(self.height))
            print(' * Width:       ' + str(self.width))
            print(' * Channels:    ' + str(self.channels))
            print(' * Batch_size:  ' + str(self.batch_size))
            print(' * Shuffle:     ' + str(self.shuffle))
            print(' * Cache mode:  ' + self.cache_mode)
            print(' * Image uri:   ' + self.images_uri)

    '''
    Goes through the dataset and outputs one batch at a time.
    '''

    def generate(self, labels, list_IDs, yield_labels=True):
        'Generates batches of samples'

        # Infinite loop
        while 1:
            # Generate random order of exploration of dataset (to make each epoch different)
            indexes = self.__get_exploration_order(list_IDs)

            # Generate batches
            imax = int(len(indexes) / self.batch_size)  # number of batches

            for i in range(imax):
                # Find list of IDs for one batch
                list_IDs_temp = [
                    list_IDs[k] for k in indexes[i * self.batch_size:(i + 1) *
                                                 self.batch_size]
                ]

                # Train, validation
                X, y = self.__data_generation(labels, list_IDs_temp,
                                              yield_labels)

                yield X, y

    '''
    Generates a random order of exploration for a given set of list_IDs.
    If activated, this feature will shuffle the order in which the examples
    are fed to the classifier so that batches between epochs do not look alike.
    Doing so will eventually make our model more robust.
    '''

    def __get_exploration_order(self, list_IDs):
        'Generates order of exploration'

        # Find exploration order
        indexes = np.arange(len(list_IDs))

        if self.shuffle == True:
            np.random.shuffle(indexes)

        return indexes

    '''
    Get data: local
    '''

    def __get_data_local(self, image_file_name):
        'Get data from local file system path'
        pixels = None

        try:
            with open(image_file_name, 'rb') as image_file:
                pixels = np.fromstring(zlib.decompress(image_file.read()),
                                       dtype=np.uint8,
                                       sep='').reshape(self.height, self.width,
                                                       self.channels)
        except:
            if self.debug == True:
                print('Exception ' + str(sys.exc_info()[0]) + ' on file ' +
                      image_file_name)

        return pixels

    '''
    Get data: remote
    '''

    def __get_data_remote(self, image_file_name):
        'Get data from HDFS'
        pixels = None
        if self.client is None:
            return pixels

        try:
            t = '/tmp/image.dat.' + str(os.getpid())
            if os.path.exists(t):
                os.remove(t)
            for f in self.client.copyToLocal([image_file_name], t):
                if f['result'] == True:
                    with open(t, 'rb') as image_file:
                        pixels = np.fromstring(
                            zlib.decompress(image_file.read()),
                            dtype=np.uint8,
                            sep='').reshape(self.height, self.width,
                                            self.channels)
                    os.remove(t)
                else:
                    print('File ' + f['path'] + ' NOT copied because "' +
                          str(f['error']) + '", sorry !')
        except:
            if self.debug == True:
                print('Exception ' + str(sys.exc_info()[0]) + ' on file ' +
                      image_file_name)

        return pixels

    '''
    Get data: local or remote
    '''

    def __get_data(self, image_file_name):
        'Get data: local or remote'
        pixels = None
        #print(' * image file name: ' + image_file_name)

        if self.cache_mode == 'hdfs2local' or self.cache_mode == 'hdfs2local-full':
            pixels = self.__get_data_local(image_file_name)
        elif self.cache_mode == 'nocache':
            pixels = self.__get_data_remote(image_file_name)
        elif self.cache_mode == 'hdfs2local-partial':
            pixels = self.__get_data_local(image_file_name)
            if pixels is None:
                pixels = self.__get_data_remote(image_file_name)
        else:
            print('ERROR: unknown "' + self.cache_mode + '" cache mode')

        return pixels

    '''
    Outputs batches of data and only needs to know about the list of IDs included
    in batches as well as their corresponding labels.
    '''

    def __data_generation(self, labels, list_IDs_temp, yield_labels):
        'Generates data of batch_size samples'  # X : (n_samples, v_size, v_size, v_size, n_channels)

        # Initialization
        X = np.empty((self.batch_size, self.height, self.width, self.channels),
                     dtype='float32')
        y = np.empty((self.batch_size), dtype='float32')

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Decompress image into pixel NumPy tensor
            image_file_name = self.images_path + '/'.join(
                ID.split('/')[1:]) + '.tar.gz'

            # Read image
            pixels = self.__get_data(image_file_name)

            # Store volume
            #pixels = np.rollaxis(pixels, 0, 3) # from 'channels_first' to 'channels_last'
            X[i, :, :, :] = pixels

            # get y value
            y_value = labels[ID]
            y[i] = y_value

        # return X and Y (train, validation)
        return X, y

    '''
    Please note that Keras only accepts labels written in a binary form
    (in a 6-label problem, the third label is written [0 0 1 0 0 0]),
    which is why we need the sparsify function to perform this task,
    should y be a list of numerical values.
    '''

    def sparsify1(self, y):
        'Returns labels in binary NumPy array'
        return np.array([[1 if y[i] == j else 0 for j in range(10)]
                         for i in range(y.shape[0])])
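The docstring above describes binary (one-hot) label encoding; a small worked example of sparsify1 (the generator instance and label values are hypothetical):

import numpy as np

# Encode three class labels as 10-wide one-hot rows.
dg = DataGenerator()
y = np.array([2, 0, 5])
print(dg.sparsify1(y))
# [[0 0 1 0 0 0 0 0 0 0]
#  [1 0 0 0 0 0 0 0 0 0]
#  [0 0 0 0 0 1 0 0 0 0]]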
Example #9
from snakebite.client import Client
import json
import boto3
import os

## Set auth keys
with open('/home/n/opt/MindBender_BD/Misc/keys') as keys:
    s3_keys = json.load(keys)
    access_key = s3_keys["s3_python_test"]["access_key"]
    secret_access_key = s3_keys["s3_python_test"]["secret_access_key"]

s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_access_key)

## Connect to HDFS with Snakebite
client = Client('localhost', 9000)

## Move file locally (temporarily)
for f in client.copyToLocal(['/spark/data.json'], '/home/n/opt/MindBender_BD/Task-021/tmp'):
    print("Moved one file.")

## Upload temp file to S3
s3.upload_file('/home/n/opt/MindBender_BD/Task-021/tmp/data.json', 'mindbender0001', 'data.json')

## Delete tmp file
os.remove('/home/n/opt/MindBender_BD/Task-021/tmp/data.json')
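The same HDFS-to-S3 hop can avoid the fixed temp path by staging through a throwaway directory; a sketch reusing the client and s3 objects from above (Python 3 only):

import os
import tempfile

# Stage the HDFS file in a temporary directory that is removed automatically.
with tempfile.TemporaryDirectory() as tmpdir:
    for f in client.copyToLocal(['/spark/data.json'], tmpdir):
        print(f)
    s3.upload_file(os.path.join(tmpdir, 'data.json'), 'mindbender0001', 'data.json')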
Example #10
def copy():
    client = Client("study", 9000, use_trash=False)
    # copyToLocal returns a lazy generator; iterate it so the copy actually runs
    for f in client.copyToLocal(["/data/gz"], "/root/data/", check_crc=False):
        pass
Example #11
from snakebite.client import Client
client = Client('localhost', 8020)  #port is the RPC port of the namenode.
for i in client.ls(['/user/cloudera/behrouz']):  #takes a list of paths!!
    print i
#get these parameters from /etc/hadoop/conf/core-site.xml (the fs.defaultFS property)
#many of the methods in snakebite return generators

#creating a directory:
#create two directories behrouz, behrouz1/b1 on HDFS:
print '*' * 40
for p in client.mkdir(['/behrouz', 'behrouz1/b1'], create_parent=True):
    print p
print '*' * 40
#deleting files and directories: with recurse=True, delete() also removes any
#subdirectories and files a directory contains
for p in client.delete(['/behrouz', 'behrouz1/b1'], recurse=True):
    print p
print '*' * 40
# retrieving data from hdfs:
#copying files from HDFS to Local file system:
for f in client.copyToLocal(['/user/cloudera/wordCount.out'],
                            '/home/cloudera/'):
    print f
print '*' * 40
#######
#reading contents of a file
for l in client.text(['/user/cloudera/testfile.txt']):
    print l
#the text method automatically decompresses and displays gzip and bzip2 files.
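If you would rather read the namenode address from the Hadoop configuration than hard-code it, a sketch along these lines works (the config path and property name follow the usual Hadoop layout; adjust for your installation):

import xml.etree.ElementTree as ET
from urlparse import urlparse  # on Python 3: from urllib.parse import urlparse

from snakebite.client import Client

# Pull fs.defaultFS (e.g. hdfs://localhost:8020) out of core-site.xml and
# build a snakebite Client from it.
def client_from_core_site(path='/etc/hadoop/conf/core-site.xml'):
    root = ET.parse(path).getroot()
    for prop in root.findall('property'):
        if prop.findtext('name') == 'fs.defaultFS':
            uri = urlparse(prop.findtext('value'))
            return Client(uri.hostname, uri.port)
    raise ValueError('fs.defaultFS not found in ' + path)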
Example #12
#!/usr/local/bin/python
from snakebite.client import Client

client = Client('localhost', 9000)

for f in client.copyToLocal(['/user/cbohara/book.txt'], '/tmp'):
    print f
Example #13
# Snakebite client for the calls below (namenode host/port are assumed; adjust for your cluster)
from snakebite.client import Client
client = Client('localhost', 9000)

# Create a couple of directories
for p in client.mkdir(['/student9_7/py_dir_01', '/student9_7/py_dir_02'],
                      create_parent=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
{'path': '/student9_7/py_dir_02', 'result': True}
'''

# Delete the `py_dir_01` directory
for p in client.delete(['/student9_7/py_dir_01'], recurse=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
'''

# Look at what the `test` file contains
for t in client.text(['/student9_7/test']):
    print(t)
'''
test file for hdfs
'''

# Copy the `test` file from HDFS to the local home directory as `retrived_file_via_py`
for f in client.copyToLocal(['/student9_7/test'], 'retrived_file_via_py'):
    print(f)
'''
{'path': '/home/student9_7/retrived_file_via_py', 'source_path': '/student9_7/test', 'result': True, 'error': ''}
'''