Example #1
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert isinstance(data_path, str)
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'

        with open(self.data_path) as data_file:
            data = json.load(data_file)
            self.hdfs_client = InsecureClient(
                url='http://' + data['namenode_url'] + ':' + str(data['port']),
                user=data['user'],
                root=data['root_path'])
            self.img_dir = data['img_dir']

        if self.img_dir[-1] != '/':
            self.img_dir += '/'

        self.file_name = 1  # running counter used as the default file name

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)

        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
            print("Mkdir ...")

        return True

    def DataProcess(self, data, append=False, file_name=None):
        assert isinstance(data, str)
        if file_name is None:
            file_name = self.img_dir + str(self.file_name)
        else:
            assert isinstance(file_name, str)
        print("start writing...")
        start = time.time()
        self.hdfs_client.write(file_name,
                               data,
                               overwrite=True,
                               replication=1,
                               append=append)
        delta = time.time() - start
        print("writing complete, time delta is " + str(delta))
        return True

    def Upload(self, remote_name, local_path):
        assert os.path.exists(local_path)

        remote_path = self.img_dir + remote_name
        self.hdfs_client.upload(remote_path, local_path, True)
        return True
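A minimal usage sketch for this class; the config file fields match the keys read in __init__, and the paths and payload here are hypothetical:

# connect_info.json is assumed to look like:
# {"namenode_url": "namenode", "port": 50070, "user": "hdfs",
#  "root_path": "/", "img_dir": "/images"}
processor = DataProcessor('./config/connect_info.json')
processor.InitImgDir()                        # empty (or create) the image dir
processor.DataProcess('col1,col2,col3')       # writes to <img_dir>/1
processor.Upload('photo.jpg', './photo.jpg')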
Example #2
def delete_data(request):
    response_content = {}
    response = HttpResponse()
    try:
        proj_id = request.GET.get('proj_id')
        data_id = request.GET.get('data_id')
        user_id = request.GET.get('user_id')
        fetched = Datasets.objects.filter(proj_id=proj_id,
                                          data_id=data_id,
                                          user_id=user_id).values('hdfs_path')
        if len(fetched) == 0:
            raise Exception('Oops! No access!')
        hdfs_path = fetched[0]['hdfs_path']
        if hdfs_path:
            client = InsecureClient("http://hdfs.neurolearn.com:50070",
                                    user="******")
            client.delete(hdfs_path, recursive=True)
        Datasets.objects.filter(proj_id=proj_id,
                                data_id=data_id,
                                user_id=user_id).delete()

        response_content['msg'] = 'success'
        response_content['error_num'] = 0
    except Exception as e:
        response_content['msg'] = str(e)
        response_content['error_num'] = 1

    response.write(json.dumps(response_content))

    return response
Example #3
class SavedModelUploader(object):
    """upload a saved model to hadoop file system"""
    def __init__(self, url, user, base_path=""):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._user = user
        self._base_path = base_path
        self._client = InsecureClient(url, user)

        if not self._exist(base_path):
            self._mkdir(base_path)

    def _exist(self, path):
        # content() returns None instead of raising when strict=False
        return self._client.content(path, strict=False) is not None

    def _mkdir(self, path):
        self._client.makedirs(path)

    def _del(self, path):
        self._client.delete(path, recursive=True)

    def _upload(self, local_path, hdfs_path):
        self._client.upload(hdfs_path, local_path)

    def _logging_progress(self, local_path, nbytes):
        msg = None
        if nbytes > 0:
            msg = "uploading: '{}' [{} bytes]".format(local_path, nbytes)
        else:
            msg = "uploading: '{}' [done]".format(local_path)
        self._logger.info(msg)

    def upload(self, local_model_path, overwrite=False):
        hdfs_model_path = self._base_path + '/' + basename(local_model_path)

        existed = self._exist(hdfs_model_path)
        if overwrite and existed:
            self._del(hdfs_model_path)
        elif not overwrite and existed:
            raise RuntimeError(
                "model already exists; pass overwrite=True to replace it")

        try:
            self._client.upload(self._base_path,
                                local_model_path,
                                progress=self._logging_progress)
        except HdfsError as e:
            self._logger.error(e)

        self._logger.info("model upload done")
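A usage sketch, assuming a reachable WebHDFS endpoint; the URL, user, and paths below are hypothetical:

logging.basicConfig(level=logging.INFO)
uploader = SavedModelUploader('http://namenode:50070', 'hdfs',
                              base_path='/models')
# uploads ./export/my_model to /models/my_model, replacing any older copy
uploader.upload('./export/my_model', overwrite=True)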
Example #4
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        列出根目录
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        写入文件
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        读取文件数据
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)

        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
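The check_dir_path decorator used above is not shown in this example; a plausible reconstruction is a guard that rejects calls without a dir_path keyword (the original behavior may differ):

from functools import wraps

def check_dir_path(func):
    # hypothetical sketch: require a non-empty dir_path keyword argument
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        if not kwargs.get('dir_path'):
            raise ValueError('dir_path is required')
        return func(self, *args, **kwargs)
    return wrapper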
Example #5
def delete_hdfs_file(remove_from_local_hdfs, schema, table_name):
    # removing hdfs temporary files
    if remove_from_local_hdfs:
        # get private ip to connect to hdfs
        import socket
        private_ip = socket.gethostbyname(socket.gethostname())

        try:
            hdfs_client = InsecureClient(
                url="http://{}:8020".format(private_ip), user="******")
            hdfs_client.delete("/user/hadoop/{}/{}".format(schema, table_name),
                               recursive=True)
        except Exception as error:
            logging.error(error)
Example #6
def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # locate files in directory
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]
    tmp_dir = "{}/tmp".format(input_dir)

    # setup temp dir
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # split files into chunks of `chunk_size` megabytes
    for f in files:
        fs = FileSplit(file=f,
                       splitsize=chunk_size * 1e6,
                       output_dir=tmp_dir)
        fs.split(callback=split_callback)

    # upload to hdfs
    hdfs_client = InsecureClient("http://{}:9870".format(
        settings.HDFS_HOST_VALUE),
                                 user=settings.HDFS_USER_VALUE)

    # delete existing output dir
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # upload files to tmp dir
    remote_path = hdfs_client.upload(hdfs_path="/tmp",
                                     local_path=tmp_dir,
                                     n_threads=-1,
                                     overwrite=True)

    # rename to output_dir
    hdfs_client.rename("/tmp", output_dir)

    print(
        "{} files uploaded to hdfs host '{}{}'  ({} file chunks total)".format(
            len(files),
            settings.HDFS_HOST_VALUE,
            output_dir,
            len(split_files),
        ))
    # delete temp files
    shutil.rmtree(tmp_dir)

    return remote_path  # HDFS path reported by client.upload() above
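This function leans on a module-level split_callback and split_files list that are not shown; a minimal sketch of the assumed helpers (fsplit is assumed to invoke the callback with the chunk path and size):

# assumed module-level state shared with upload_to_hdfs()
split_files = []

def split_callback(path, size):
    # record each chunk written by FileSplit.split()
    split_files.append(path)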
Example #7
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert isinstance(data_path, str)
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'

        with open(self.data_path) as data_file:
            data = json.load(data_file)
            print("Data: ", data)
            self.hdfs_client = InsecureClient(
                url='http://' + data['namenode_url'] + ':' + str(data['port']),
                user=data['user'],
                root=data['root_path'])
            print("hdfs client: ", self.hdfs_client)
            self.img_dir = data['img_dir']
            print("img dir: ", self.img_dir)

        if self.img_dir[-1] != '/':
            self.img_dir += '/'

        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)

        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)

        return True

    def Upload(self, file_path, threads=2):
        print("FilePath: ", file_path)
        print("img_dir: ", self.img_dir[:-1])
        self.hdfs_client.upload(hdfs_path=self.img_dir[:-1],
                                local_path=file_path,
                                n_threads=threads,
                                overwrite=True)
        return 0
Example #8
class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # check existence on HDFS; status() returns None when strict=False
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive)
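A short usage sketch; CONST.HDFS_URL and CONST.HDFS_USER come from the original module, and the paths here are hypothetical:

wrapper = HdfsWrapper()
wrapper.connect_hdfs()
wrapper.mkdir_hdfs('/tmp/demo')
wrapper.write_hdfs('/tmp/demo/a.txt', b'hello', overwrite=True)
print(wrapper.read_hdfs('/tmp/demo/a.txt'))    # b'hello'
wrapper.delete_hdfs('/tmp/demo', recursive=True)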
Example #9
class Storage:
    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if self.protocol == 'webhdfs':
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
            # bind the generic method names to their webhdfs implementations
            for f in 'upload download list status delete'.split():
                setattr(self, f, getattr(self, '%s_%s' % (f, self.protocol)))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path,
                                  hdfs_path=remote_path,
                                  **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path,
                                    hdfs_path=remote_path,
                                    overwrite=True,
                                    **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
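After construction the generic names (upload, list, ...) dispatch to the webhdfs variants, so callers stay protocol-agnostic; a sketch with a hypothetical endpoint:

storage = Storage('webHDFS', 'http://namenode:50070', user='hdfs')
storage.upload('./model.bin', '/models/model.bin')   # calls upload_webhdfs
print(storage.list('/models'))                       # calls list_webhdfs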
Example #10
def init_context(context):
    key_count = defaultdict(int)
    setattr(context.user_data, "key_count", key_count)

    # init HDFS
    hdfs_host = os.environ.get("HDFS_HOST")
    hdfs_user = os.environ.get("HDFS_USER")
    output_path = os.environ.get("REDUCER_OUTPUT_FILENAME")
    hdfs_client = InsecureClient("http://{}:9870".format(hdfs_host),
                                 user=hdfs_user)

    # delete existing output file
    if hdfs_client.content(output_path, strict=False) is not None:
        hdfs_client.delete(output_path)

    setattr(context.user_data, "hdfs_client", hdfs_client)
Example #11
def delJPG_Newmodel(basepath):
    now_time = datetime.datetime.now()
    now_date_str = now_time.strftime('%Y%m%d')
    now_date = datetime.datetime.strptime(now_date_str, '%Y%m%d')
    try:
        client = InsecureClient('http://10.41.158.65:50070', user='******')
        folderlist = client.list(basepath)
        newmodel = getnewmodel()
        for fname in folderlist:
            if fname in newmodel:
                folderlist1 = client.list(basepath + '/' + fname)
                print('newmodel:', fname, folderlist1)
                for subdir in folderlist1:
                    if is_valid_date(subdir) == 'true':
                        folder_date = datetime.datetime.strptime(
                            subdir, '%Y%m%d')
                        # remove model data older than one year
                        if folder_date + datetime.timedelta(
                                days=365) <= now_date:
                            paths = basepath + '/' + fname + '/' + subdir
                            delHbase(subdir, client, paths)
                            deleteKudu(subdir, client, paths)

                            try:
                                client.delete(paths, recursive=True)
                                print(paths + ' is deleted')

                            except Exception as e:
                                print(e)
    except Exception as e:
        print(e)
Example #12
def orchestrationTraining():
    hdfs_cli = InsecureClient('http://192.168.1.4:9870', user='******')

    hdfs_cli.delete('/images', recursive=True)
    hdfs_cli.delete('/images_augmented', recursive=True)
    hdfs_cli.delete('/images_crop', recursive=True)
    hdfs_cli.delete('/images_norm', recursive=True)

    data = request.get_json()

    url = data["url_db"]

    classifiers = data["classifiers"]

    list_algo = []

    for algo in classifiers:
        if algo not in list_algo_deep and algo not in list_algo_ml:
            return algo + ' is an incorrect algo.'
        list_algo.append(algo)

    orch = Orchestration(url, list_algo)
    list_returns_trains = orch.run()

    string_result = '{ \"returns_trains\": {'
    for i in range(len(list_returns_trains)):
        string_result += list_returns_trains[i]
        if i == len(list_returns_trains) - 1:
            string_result += '}}'
        else:
            string_result += ','
    return json.loads(string_result)
Example #13
    def launcher(self):
        """ Send remove checkpoints task """

        # Connect
        client = InsecureClient('http://{ip}:{port}'.format(
            ip=self.namenode_ip, port=self.namenode_port),
                                user=self.file_user)

        # Get current unix timestamp in milliseconds (timegm expects UTC)
        timenow = calendar.timegm(datetime.datetime.utcnow().timetuple())
        unix_timestamp = int(timenow * 1000)
        onehour = 3600000  # one hour in milliseconds
        todelete = int(unix_timestamp - onehour)

        # Return file name list
        for directory in self.directories:
            fnames = client.list(directory, status=True)

            # Fetch list and sets modificationTime
            for fname in fnames:
                ctime = fname[1]['modificationTime']
                if ctime <= todelete:
                    dirtodelete = fname[1]['pathSuffix']
                    client.delete('{directory}/{dirtodelete}'.format(
                        directory=directory, dirtodelete=dirtodelete),
                                  recursive=True)
                    l.info(
                        'Removing {dir} ...Removed!'.format(dir=dirtodelete))
                    # list.append() returns None, so record deletions directly
                    self.deleteddirs.append(dirtodelete)
                else:
                    l.info(
                        'Nothing to remove in {directory}. Bye bye!'.format(
                            directory=directory))

        if self.deleteddirs:
            stdout = self.deleteddirs
        else:
            stdout = 'No directories were deleted.'

        return {'Deleted directories': stdout}
Example #14
def download_file(path, test_case_number, task_number):
    try:
        client = InsecureClient(
            ('http://' + HADOOP_HOST_NAME + ':' + HADOOP_NAMENODE_PORT_NUMBER),
            user=HADOOP_USER_NAME)
    except Exception:
        print("Error connecting to hdfs client")
        return
    try:
        client.download(
            HADOOP_OUTPUT_PATH + task_number + test_case_number + "/",
            os.path.join(path, test_case_number))
    except Exception as e:
        print(e)
        print("Error downloading output file from hdfs")
        return
    try:
        client.delete(HADOOP_OUTPUT_PATH + task_number + test_case_number,
                      recursive=True)
    except Exception:
        print("Error deleting hdfs output directory")
        return
Example #15
def full_load(tables, cur):
    for table in tables:
        tableName = table
        ts = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S')
        query = "COPY (SELECT * FROM " + tableName + ") TO '/tmp/" + tableName + "_FL" + ts + ".csv'"
        cur.execute(query)
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect('172.16.6.89', username='******', password='******')
        ftp = ssh.open_sftp()
        ftp.get("/tmp/" + tableName + "_FL" + ts + ".csv",
                "Gp/" + tableName + "_FL" + ts + ".csv")
        ftp.close()
        #Connect To hadoop
        client = InsecureClient('http://172.16.4.144:50070', user='******')
        client.delete("/user/root/greenplum/source/" + tableName, True)
        client.makedirs("/user/root/greenplum/source/" + tableName, "0777")
        client.upload(
            "/user/root/greenplum/source/" + tableName + "/",
            "F:/Srilatha/Attunity-POC/Greenplum/Gp/" + tableName + "_FL" + ts +
            ".csv")
        sql = "INSERT INTO control_table(table_name) VALUES(%s);"
        cur.execute(sql, (tableName, ))
        connection.commit()
Example #16
    def delete_directory(self, directory_url):
        web_hdfs_url = Environment().get_web_hdfs_url()
        session = SwSessionManager().get_session()
        user_name = session.get_username()
        client = InsecureClient(web_hdfs_url, user_name)
        try:
            directory_name_with_path = urllib3.util.parse_url(
                directory_url).path
            logger.log_info(
                "Deleting the directory {}".format(directory_name_with_path))
            response = client.delete(directory_name_with_path, recursive=True)
            if not response:
                raise ServiceError("Directory {0} doesn't exist".format(
                    directory_name_with_path))
            return

        except Exception as e:
            raise ServiceError(
                "Deleting the folder from HDFS failed with the error: {0}".
                format(str(e)))
Example #17
def orchestrationPrediction():
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')

    hdfs_client.delete('/image_test', recursive=True)
    hdfs_client.delete('/image_test_crop', recursive=True)
    hdfs_client.delete('/image_test_ready', recursive=True)

    if request.files.get('picture') is None:
        return json.dumps(None)

    picture = request.files['picture']

    with hdfs_client.write('/image_test/test.jpg') as writer:
        picture.save(writer)

    data = pd.DataFrame(['test.jpg'], columns=['Path'])
    with hdfs_client.write('/image_test/data.csv', encoding='utf-8') as writer:
        data.to_csv(writer, index_label='index')

    classifiers = request.form.getlist('classifiers')

    list_algo = []

    for algo in classifiers[0].split(','):

        if algo not in list_algo_deep and algo not in list_algo_ml:
            return algo + ' is an incorrect algo.'
        list_algo.append(algo)

    orchPred = OrchestrationPrediction('test.jpg', list_algo)
    list_returns_predict = orchPred.run()

    data = {}
    data['returns_predictions'] = {}

    for res in list_returns_predict:
        key = list(res.keys())[0]
        data['returns_predictions'][key] = res[key]

    return json.dumps(data)
Example #18
    def hadoop_load(self):

        # dump data from the local file system into the hadoop ecosystem
        client_hdfs = InsecureClient('http://localhost:50070', user="******")
        for csv_name in ('ethfinal68.csv', 'btcfinal68.csv', 'ltcfinal68.csv'):
            local_path = "/home/student/Pied Piper/{}".format(csv_name)
            try:
                client_hdfs.upload('/', local_path)
            except Exception:
                # the file already exists on HDFS: remove it and retry
                client_hdfs.delete(hdfs_path='/' + csv_name, recursive=True)
                client_hdfs.upload('/', local_path)
Example #19
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.

"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"
c = InsecureClient(hdfs_url, user=hdfs_user)

c.write("/test_write", data="string")
c.delete("/test_write")
c.makedirs("/new/path") # 自动递归创建

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
Example #20
# import hdfs library to Python
from hdfs import InsecureClient

# log in to the hdfs server
client = InsecureClient('http://master32:50070', user='******')

# print the contents of the hdfs root folder
print(client.list('/'))

path = '/test/aaa.txt'

# check if the file exists
if client.content(path, strict=False) is not None:
    client.delete(path)

print("START TO WRITE FILE")

# write a text file to hdfs
with client.write(path, encoding='utf-8') as writer:
    for i in range(10):
        writer.write("Hello World\n")

print("DONE")

print("START TO READ FILE")

# read a text file from hdfs
with client.read(path, chunk_size=8096) as reader:
    for chunk in reader:
        print(chunk)
Example #21
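The snippet below assumes an existing client_hdfs and a pandas DataFrame df; a minimal setup sketch with a hypothetical endpoint:

import pandas as pd
from hdfs import InsecureClient

client_hdfs = InsecureClient('http://localhost:50070', user='hdfs')
df = pd.DataFrame({'language': ['en', 'de'], 'pages': [100, 50]})
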
# ==== Writing Dataframe to HDFS ====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv',
                       encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv',
                      encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting Content Summary ====
client_hdfs.content('hdfs_path')

# ==== Remove a directory or File in HDFS ====
client_hdfs.delete('hdfs_path', recursive=False, skip_trash=True)

# ==== Create a Directory ====
client_hdfs.makedirs('hdfs_path', permission=None)

# ==== Upload File into HDFS ====
client_hdfs.upload('hdfs_path',
                   'local_path',
                   n_threads=1,
                   temp_dir=None,
                   chunk_size=65536,
                   progress=None,
                   cleanup=True,
                   overwrite=True)

# Source : https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
Example #22
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            print(self.client.download(remote_path, local_path=local_path, overwrite=True,
                                       temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            self.client.status(self.path(name))
            return True
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
Example #23
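This excerpt starts mid-script at step 5; the earlier steps (creating the client, importing posixpath as psp, and reading the local RandomText.txt) are assumed roughly as follows, with hypothetical connection details:

import posixpath as psp
from hdfs import InsecureClient

client = InsecureClient('http://localhost:50070', user='hdfs')

# read the local file; the loop keeps its last line in `temp`
with open('./RandomText.txt') as f: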
    for line in f:
        temp = line

# 5. Append the local file RandomText.txt to the end of the HDFS file: /activity1/data/NotSoRandomText.txt
client.write(hdfs_path='/activity1/data/NotSoRandomText.txt',
             data=temp,
             append=True)

# 6. List the disk space used by the directory /activity1/data/
diskSpaceUsed = client.content('/activity1/data/', strict=True)
print(diskSpaceUsed['spaceConsumed'])

# 7. Put the local file MoreRandomText.txt into HDFS as the path: /activity1/data/MoreRandomText.txt
client.upload(hdfs_path='/activity1/data/', local_path='./MoreRandomText.txt')
print(client.list('/activity1/data'))

# 8. Recursively list the contents of the directory /activity1/
fnames = client.list('/activity1')

fpaths = [
    psp.join(dpath, fname) for dpath, _, fnames in client.walk('/activity1')
    for fname in fnames
]

print(fpaths)

# 9. Remove the directory /activity1/ and all files/directories underneath it
client.delete(hdfs_path='/activity1', recursive=True)
print(client.list('/'))

print('End')
Example #24
def remove_in_hdfs(hdfs_path):
    client = InsecureClient('http://quickstart.cloudera:50070', user='******')
    client.delete(hdfs_path, recursive=True)
Example #25
from hdfs import InsecureClient

hdfs_cli = InsecureClient('http://192.168.1.4:9870', user='******')

hdfs_cli.delete('/images', recursive=True)
hdfs_cli.delete('/images_augmented', recursive=True)
hdfs_cli.delete('/images_crop', recursive=True)
hdfs_cli.delete('/images_norm', recursive=True)
hdfs_cli.delete('/image_test', recursive=True)
hdfs_cli.delete('/image_test_crop', recursive=True)
hdfs_cli.delete('/image_test_ready', recursive=True)
hdfs_cli.delete('/algo_trained', recursive=True)

Example #26
from hdfs import InsecureClient

client = InsecureClient("http://localhost:9870", user='******')
client.delete("streamInput/area", True)
client.makedirs("streamInput/area")
Example #27
class HadoopFileSystem():
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme != 'http' and u.scheme != 'https':
            raise ValueError("Invalid name node address")

        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'

    def normalize_path(self, path):
        path = os.path.normpath(path)
        path = self.strip_prefix(path)
        while path and path[0] == os.sep:
            path = path[1:]
        return os.path.join(self.localdir, path)

    def strip_prefix(self, path):
        return path[len(self.prefix):] if path.startswith(
            self.prefix) else path

    def strip_root(self, path):
        path = self.strip_prefix(path)
        if path.startswith(self.url):
            path = path[len(self.url):]
            if not path.startswith(self.localdir):
                raise ValueError(
                    'Invalid hdfs path. It must start with the root directory')
        return path[len(self.localdir):] if path.startswith(
            self.localdir) else path

    def create_folder(self, path):
        try:
            path = self.normalize_path(path)
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def remove(self, path):
        try:
            path = self.normalize_path(path)
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def rename(self, oldpath, newpath):
        try:
            oldpath = self.normalize_path(oldpath)
            newpath = self.normalize_path(newpath)
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def get_files(self, path):
        path = self.normalize_path(path)
        files = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] != "DIRECTORY":
                files.append(f)
        return files

    def get_folders(self, path):
        path = self.normalize_path(path)
        folders = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] == "DIRECTORY":
                folders.append(f)
        return folders

    def exists(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return not (status is None)

    def isdir(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "DIRECTORY"

    def isfile(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "FILE"

    def read(self, path):
        path = self.normalize_path(path)
        with self.client.read(path) as reader:
            return reader.read().decode('utf-8')

    def write(self, path, content):
        path = self.normalize_path(path)
        self.client.write(path, content)

    def make_json(self, path):
        normalized_path = self.normalize_path(path)
        data_json = {
            'path': urljoin(self.url, normalized_path),
            'text': os.path.basename(path)
        }
        status = self.client.status(normalized_path, False)

        if status is not None:
            data_json['folder'] = status['type'] == "DIRECTORY"
            if status['type'] == "DIRECTORY":
                data_json['nodes'] = [
                    self.make_json(os.path.join(path, fn))
                    for fn in self.client.list(normalized_path)
                ]
        #print(json.dumps(data_json))
        return data_json

    def save_upload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            if isfile(fullpath):
                fullpath = os.path.dirname(fullpath)
            self.client.upload(self.normalize_path(fullpath), localpath, True)
        except Exception:
            pass

    def download(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(path))
            return self.client.download(path, localpath, True)
        else:
            return None
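A usage sketch: the constructor takes a WebHDFS URL whose path component becomes the working root on HDFS (all values hypothetical):

fs = HadoopFileSystem('http://namenode:50070/user/demo', 'demo')
fs.create_folder('data')               # resolves to /user/demo/data
print(fs.exists('data'))               # True
print(fs.get_folders('/'))             # folders under /user/demo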
Example #28
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            print(self.client.download(remote_path, local_path=local_path, overwrite=True,
                                       temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            self.client.status(self.path(name))
            return True
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
Example #29
class HDFSLibrary:
    """
        Test library for working with HDFS
    """
    WEB_HDFS_URL = ""
    client = ""

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                print("ERROR: File does not exist:", file_path)
                return "ERROR: File does not exist: " + file_path
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        # read as text so the lines can be concatenated to a str
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self,
                                   file_path,
                                   text1,
                                   text2="aqwszx",
                                   text3="xzswqa"):
        ret = self.check_hdfs_file_exists(file_path, stop=True)
        found = "" if ret else ret
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(
                        text2) == -1 and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self,
                                     file_path,
                                     text1,
                                     text2="aqwszx",
                                     text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(
                        text2) != -1 or line.find(text3) != -1:
                    return False
        return True

    ########################
    # # BASIC FUNCTIONS: # #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """
        Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """
        Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """
        Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """
        Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """
        Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful otherwise `False`
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """
        Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path,
                                    local_path,
                                    overwrite=True,
                                    n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """
        Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path,
                                  local_path,
                                  overwrite=True,
                                  n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """
        Get the checksum value for file
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm='755'):
        """
        Create a directory or recursive dirs on HDFS
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
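Although written as a Robot Framework test library, the keywords are plain methods and can be exercised directly from Python (namenode address hypothetical):

lib = HDFSLibrary(namenode="namenode", port="50070")
lib.create_hdfs_dir('/robot/demo')
lib.copy_from_local_hdfs_file('./data.txt', '/robot/demo/data.txt')
print(lib.get_hdfs_file_content('/robot/demo/data.txt'))
print(lib.delete_hdfs_file('/robot/demo'))   # True if deletion succeeded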
Example #30
import io
# For Data Lake
from hdfs import InsecureClient
# For Data Warehouse
from pyhive import hive

import pandas as pd

df_source = pd.read_csv(r'output/price.csv')

# Define HDFS interface
hdfs_interface = InsecureClient('http://localhost:50070')
hdfs_interface.list('/')

# Delete old data
hdfs_interface.delete('/wqd7005/raw_price', recursive=True, skip_trash=True)

# Create hdfs directories to store data
hdfs_interface.makedirs('/wqd7005')
hdfs_interface.makedirs('/wqd7005/raw_price')
hdfs_interface.list('/wqd7005')

# Write data to raw_price directory

# text buffer
s_buf = io.StringIO()
# saving a data frame to a buffer (same as with a regular file):
df_source.to_csv(s_buf, index=False, header=False)

hdfs_interface.write('/wqd7005/raw_price/000000_0',
                     data=s_buf.getvalue())
Example #31
# Check if there is data for a prediction
client_hdfs = InsecureClient('http://awscdh6-ma.sap.local:9870', user='******')
hdfs_content = client_hdfs.list('/tmp/tbr/BARMER/XSA')
print(hdfs_content)
print()

if len(hdfs_content) > 0 and hdfs_content[0] == 'iris.csv':

    print('Starting prediction')

    # source of the R script
    source_path = 'https://github.com/JimKnopfSun/BARMER_XSA.git'

    # destination of the R script on XSA
    target_path = '/usr/sap/HN2/home/testdir/'

    # clear out old script downloads on XSA
    shutil.rmtree(path=target_path + "/BARMER_XSA",
                  ignore_errors=True,
                  onerror=None)

    # clone the R script onto XSA
    git_clone(source_path, target_path)

    # run the R script
    r = robjects.r
    _ = r.source(target_path + "/BARMER_XSA/sample.R")

    # Remove Data from HDFS
    client_hdfs.delete("/tmp/tbr/BARMER/XSA/iris.csv")
Example #32
class HadoopFileSystem(object):
    def __init__(self, *opts):
        self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'], user=current_app.config['WEBHDFS_USER'])
         

    def make_json(self, datasourceid, base, relative_path):
        path = os.path.join(base, relative_path)
        data_json = {'datasource': datasourceid, 'path': relative_path, 'name': os.path.basename(relative_path) }
        status = self.client.status(path, False)

        if status is not None:
            if status['type'] == "DIRECTORY":
                data_json['type'] = DataType.Folder
                data_json['children'] = [self.make_json(datasourceid, base, os.path.join(relative_path, fn)) for fn in self.client.list(path)]
            else:
                data_json['type'] = DataType.File
        #print(json.dumps(data_json))
        return data_json
    
    def makedirs(self, path):
        try:
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def delete(self, path):
        try:
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def addfolder(self, path):
        # find the first "New Folder (i)" name that is not already taken
        i = 0
        while self.client.status(os.path.join(path, "New Folder ({0})".format(i)), False) is not None:
            i += 1
        return self.makedirs(os.path.join(path, "New Folder ({0})".format(i)))

    def rename(self, oldpath, newpath):
        try:
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def saveUpload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            self.client.upload(os.path.dirname(fullpath), localpath, True)
        except Exception:
            pass
        
    def download(self, fullpath):
        status = self.client.status(fullpath, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
            return self.client.download(fullpath, localpath, True)
        else:
            return None