Example #1
import logging
from os.path import basename

from hdfs import InsecureClient
from hdfs.util import HdfsError


class SavedModelUploader(object):
    """Upload a saved model to the Hadoop file system (HDFS)."""
    def __init__(self, url, user, base_path=""):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._user = user
        self._base_path = base_path
        self._client = InsecureClient(url, user)

        if not self._exist(base_path):
            self._mkdir(base_path)

    def _exist(self, path):
        return self._client.content(path, strict=False) is not None

    def _mkdir(self, path):
        self._client.makedirs(path)

    def _del(self, path):
        self._client.delete(path, recursive=True)

    def _upload(self, local_path, hdfs_path):
        self._client.upload(hdfs_path, local_path)

    def _logging_progress(self, local_path, nbytes):
        msg = None
        if nbytes > 0:
            msg = "uploading: '{}' [{} bytes]".format(local_path, nbytes)
        else:
            msg = "uploading: '{}' [done]".format(local_path)
        self._logger.info(msg)

    def upload(self, local_model_path, overwrite=False):
        hdfs_model_path = self._base_path + '/' + basename(local_model_path)

        exists = self._exist(hdfs_model_path)
        if overwrite and exists:
            self._del(hdfs_model_path)
        elif not overwrite and exists:
            raise RuntimeError(
                "model already exists at '{}' and overwrite is False".format(
                    hdfs_model_path))

        try:
            self._client.upload(self._base_path,
                                local_model_path,
                                progress=self._logging_progress)
        except HdfsError as e:
            self._logger.error(e)
            raise

        self._logger.info("model upload done")
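A minimal usage sketch; the WebHDFS URL, user, and paths below are placeholders, not part of the original example:

import logging

logging.basicConfig(level=logging.INFO)

# Hypothetical values; point these at your own NameNode and model directory.
uploader = SavedModelUploader("http://namenode:9870", user="hdfs",
                              base_path="/models")
uploader.upload("/tmp/saved_model", overwrite=True)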
Example #2
import os
import shutil
from os import listdir
from os.path import isfile, join

# FileSplit comes from the "filesplit" package (legacy fsplit API); "settings"
# is assumed to be the project's own configuration module.
from fsplit.filesplit import FileSplit
from hdfs import InsecureClient


def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # locate files in directory
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]
    tmp_dir = "{}/tmp".format(input_dir)

    # setup temp dir
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # split each file into chunks of `chunk_size` MB
    split_files = []
    for f in files:
        fs = FileSplit(file=f,
                       splitsize=chunk_size * 1e6,
                       output_dir=tmp_dir)
        # collect the paths of the generated chunks as they are written
        fs.split(callback=lambda path, size: split_files.append(path))

    # upload to hdfs
    hdfs_client = InsecureClient("http://{}:9870".format(
        settings.HDFS_HOST_VALUE),
                                 user=settings.HDFS_USER_VALUE)

    # delete the existing output dir, if any
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # upload files to tmp dir
    remote_path = hdfs_client.upload(hdfs_path="/tmp",
                                     local_path=tmp_dir,
                                     n_threads=-1,
                                     overwrite=True)

    # rename to output_dir
    hdfs_client.rename("/tmp", output_dir)

    print(
        "{} files uploaded to hdfs host '{}{}'  ({} file chunks total)".format(
            len(files),
            settings.HDFS_HOST_VALUE,
            output_dir,
            len(split_files),
        ))
    # delete temp files
    shutil.rmtree(tmp_dir)

    return remote_path
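A hypothetical invocation, assuming settings.HDFS_HOST_VALUE and settings.HDFS_USER_VALUE are configured elsewhere; the directory names and the 128 MB chunk size are illustrative:

# Split everything under /data/raw into ~128 MB chunks and push it to /data/ingest.
upload_to_hdfs(input_dir="/data/raw", output_dir="/data/ingest", chunk_size=128)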
Example #3
    def fileExists(server, filename):
        """ Checks if a file exists in HDFS.

            Args:
                server: HDFS server and port, for example "http://hadoop1:50070".
                filename: path of the file to check.

            Returns:
                True if the file exists, False otherwise.
        """

        from hdfs import InsecureClient

        client = InsecureClient(server)
        return client.content(filename, strict=False) is not None
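A usage sketch, assuming the method is exposed as a static helper on its enclosing class (named HdfsUtils here purely for illustration; the server address and path are placeholders):

if not HdfsUtils.fileExists("http://hadoop1:50070", "/data/part-00000.csv"):
    print("file is missing, upload it first")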
Example #4
import os
from collections import defaultdict

from hdfs import InsecureClient


def init_context(context):
    key_count = defaultdict(int)
    setattr(context.user_data, "key_count", key_count)

    # init HDFS
    hdfs_host = os.environ.get("HDFS_HOST")
    hdfs_user = os.environ.get("HDFS_USER")
    output_path = os.environ.get("REDUCER_OUTPUT_FILENAME")
    hdfs_client = InsecureClient("http://{}:9870".format(hdfs_host),
                                 user=hdfs_user)

    # delete the existing output file, if any
    if hdfs_client.content(output_path, strict=False) is not None:
        hdfs_client.delete(output_path)

    setattr(context.user_data, "hdfs_client", hdfs_client)
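A sketch of how the stored client might be used later to flush the counts; the function name and the CSV output format are assumptions, not part of the example:

def flush_counts(context):
    # Serialize the accumulated counts as CSV lines and write them to HDFS.
    output_path = os.environ.get("REDUCER_OUTPUT_FILENAME")
    lines = "".join("{},{}\n".format(k, v)
                    for k, v in context.user_data.key_count.items())
    context.user_data.hdfs_client.write(output_path, data=lines,
                                        encoding="utf-8", overwrite=True)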
Example #5
def getPathLength(spath, host=None):
    from taskOpt.opt_utils import hadoop_host, hadoop_web_ui_port, hadoop_hdfs_port
    try:
        if host is None:
            host = "http://{hadoop_host}:{hadoop_web_ui_port}".format(
                hadoop_host=hadoop_host, hadoop_web_ui_port=hadoop_web_ui_port)
        client = InsecureClient(host)
        spath = spath.split("hdfs://{hadoop_host}:{hadoop_hdfs_port}".format(
            hadoop_host=hadoop_host, hadoop_hdfs_port=hadoop_hdfs_port))[-1]
        length = client.content(spath)['length']
        assert length is not None
        return length
    except Exception as e:
        print(Fore.RED +
              "error occurred when accessing HDFS, check the host: {host}".
              format(host=host))
        logging.exception(e)
        raise
Example #6
# import the hdfs client library
from hdfs import InsecureClient

# log in to the hdfs server
client = InsecureClient('http://master32:50070', user='******')

# list the contents of the hdfs root folder
print(client.list('/'))

path = '/test/aaa.txt'

# check if the file exists and delete it if it does
if client.content(path, strict=False) is not None:
    client.delete(path)

print("START TO WRITE FILE")

# write a text file to hdfs
with client.write(path, encoding='utf-8') as writer:
    for i in range(10):
        writer.write("Hello World\n")

print("DONE")

print("START TO READ FILE")

# read a text file from hdfs
with client.read(path, chunk_size=8096) as reader:
    for chunk in reader:
        print(chunk)
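Note that the chunks above come back as raw bytes; a small variant (same client and path) reads the file back as decoded text instead:

# Decode on the fly by passing an encoding instead of a chunk size.
with client.read(path, encoding='utf-8') as reader:
    print(reader.read())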
Example #7
import pandas as pd

liste_hello = ['hello1', 'hello2']
liste_world = ['world1', 'world2']
df = pd.DataFrame(data={'hello': liste_hello, 'world': liste_world})

# ==== Writing Dataframe to HDFS =====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv',
                       encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv',
                      encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting Content Summary ====
client_hdfs.content('hdfs_path')

# ==== Remove a directory or File in HDFS ====
client_hdfs.delete('hdfs_path', recursive=False, skip_trash=True)

# ==== Create a Directory ====
client_hdfs.makedirs('hdfs_path', permission=None)

# ==== Upload File into HDFS ====
client_hdfs.upload('hdfs_path',
                   'local_path',
                   n_threads=1,
                   temp_dir=None,
                   chunk_size=65536,
                   progress=None,
                   cleanup=True)
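The snippet assumes client_hdfs was created beforehand; a minimal sketch of doing so (host, port, and user are placeholders):

from hdfs import InsecureClient

# Hypothetical WebHDFS endpoint; adjust to your cluster.
client_hdfs = InsecureClient('http://namenode:9870', user='hdfs')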
Example #8
class MasterCrawler:
    def __init__(self, url_ckan, redis_ip, redis_port):
        self.ckan = url_ckan
        self.r = redis.StrictRedis(host=redis_ip, port=redis_port, db=0)
        self.client = InsecureClient('http://cdh1:50070/',
                                     'admin',
                                     root='/user/admin/open_data')

    def formatUrl(self, url):
        urlSplit = url.rsplit('/', 1)
        urlEnd = urllib.quote(urlSplit[1])
        urlStart = urlSplit[0]
        finalUrl = urlStart + "/" + urlEnd
        return finalUrl

    def initializeRedis(self):
        content = self.client.content('dati_gov/dati_gov.json', strict=False)
        if not content:
            with self.client.write('dati_gov/dati_gov.json',
                                   encoding='utf-8') as writer:
                writer.write('')
        request = urllib2.Request(URL_DATI_GOV + "/api/3/action/package_list")
        response = urllib2.urlopen(request)
        assert response.code == 200
        response_dict = json.loads(response.read())
        # Check the contents of the response.
        assert response_dict['success'] is True
        result = response_dict['result']
        test_res = result  #[:2000]
        for res in test_res:
            print(res)
            self.r.rpush("dataset_id", res)

    def consumeData(self):
        red = self.r
        while (red.llen("dataset_id") != 0):
            dataset_id = red.lpop("dataset_id")
            encRes = urllib.urlencode(
                {"id": unicode(dataset_id).encode('utf-8')})
            request_info = urllib2.Request(URL_DATI_GOV +
                                           "/api/3/action/package_show?" +
                                           encRes)
            #request_info.add_header("Authorization", "Basic %s" % base64string)
            try:
                response_info = urllib2.urlopen(request_info)
                info_dataset = json.loads(response_info.read())
                results = info_dataset['result']
                info = results
                #print json.dumps(info)
                if 'resources' in info:
                    #print info
                    info["m_status_resources"] = "ok"
                    resources = info['resources']
                    name = info['name']
                    idInfo = info['id']
                    for resource in resources:
                        rUrl = resource['url']
                        rFormat = resource['format']
                        rName = resource['name']
                        rId = resource['id']
                        finalUrl = self.formatUrl(rUrl)
                        print(finalUrl)
                        rInfo = urllib2.Request(finalUrl)
                        try:
                            rReq = urllib2.urlopen(rInfo)
                            if rReq.code == 200:
                                resource["m_status"] = "ok"
                                if "csv" in rFormat.lower():
                                    print("passing through here")
                                    data = rReq.read()
                                    data_dir = "dati_gov/open_api/csv/" + dataset_id
                                    existDir = self.client.content(
                                        data_dir, strict=False)
                                    if not existDir:
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".csv"
                                    #with self.client.write(file_path, encoding='utf-8') as writer:
                                    with self.client.write(
                                            file_path) as writer:
                                        writer.write(data)
                                if "json" in rFormat.lower():
                                    data = rReq.read()
                                    data_dir = "dati_gov/open_api/json/" + dataset_id
                                    existDir = self.client.content(
                                        data_dir, strict=False)
                                    if not existDir:
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".json"
                                    #    with self.client.write(file_path, encoding='utf-8') as writer:
                                    with self.client.write(
                                            file_path) as writer:
                                        writer.write(data)
                            else:
                                resource["m_status"] = "ko"
                        except Exception as e:
                            resource["m_status"] = "ko"
                            print(str(e))
                else:
                    print(info)
                    info["m_status_resources"] = "ko"
                    print("NO RESOURCES")
                with self.client.write('dati_gov/dati_gov.json',
                                       encoding='utf-8',
                                       append=True) as writer:
                    writer.write(json.dumps(info) + '\n')
            except Exception as e:
                print(str(e))
                red.lpush("dataset_error", dataset_id)
Example #9
# helper to print stats about a file/directory;
# the prefix is the first printed permission character,
# 'd' for a directory, otherwise '-'
def printfile(name, stats, prefix='-'):
    print(' '.join((
        perms(stats['permission'], prefix),
        '  -' if stats['replication'] == 0 else '%3d' % stats['replication'],
        stats['owner'],
        stats['group'],
        '%10d' % stats['length'],
        datetime.fromtimestamp(stats['modificationTime'] / 1000).strftime('%Y-%m-%d %H:%M'),
        name)))
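A usage sketch for the helper; perms is assumed to be defined earlier in the original script, client is the InsecureClient used throughout, and the path is illustrative:

# Hypothetical ls-style listing built on client.list() and client.status().
for name in client.list('/activity1'):
    stats = client.status('/activity1/' + name)
    printfile(name, stats, prefix='d' if stats['type'] == 'DIRECTORY' else '-')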


# 1. Make a directory named: /activity1/
content = client.content('/activity1')
print(content)

# client.makedirs(hdfs_path='/activity1/', permission=None)
# client.makedirs(hdfs_path='/activity1/data/', permission=None)
# print(client.list(hdfs_path='/'))
# print('/activity1/data directory created')

# 2. Put the file RandomText.txt into HDFS as the path: /activity1/data/RandomText.txt
# client.upload(hdfs_path='/activity1/data/', local_path='/Workspace/cs6500_sp2021_r02_jeon/RandomText.txt')
# print('Uploaded')

# 3. List the contents of the directory /activity1/data/

# 4. Move the file /activity1/data/RandomText.txt to /activity1/data/NotSoRandomText.txt
Example #10
def getPathLength(spath, host='http://localhost:50070', user='******'):
    try:
        client = InsecureClient(host, user)
        return client.content(spath)['length']
    except Exception:
        print("error occurred when accessing HDFS, check the host: {} and the user: {}"
              .format(host, user))
Example #11
# coding: utf-8
"""
Copy the parsed log into the sandbox
"""
import hdfs
from hdfs import InsecureClient

# open a session with WebHDFS
client = InsecureClient('http://sandbox.hortonworks.com:50070',
                        user='******',
                        timeout=1000)

# these values could be passed in as program parameters
sourceFile = "datos/salida-log_2.txt"
destFile = "log_parser_apache"
sandboxPath = "/datos/logApache/"
dpath = sandboxPath + destFile
limite = 1000

content = client.content(sandboxPath)
print(content)
# open the file and prepare the parser with the parameters we have set
fo = open(sourceFile, 'r')

client.write(dpath, data=fo.read())
fo.close()
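For a large log, the open file handle can be passed directly so the contents are streamed instead of read fully into memory; a sketch using the same paths:

# Streaming variant (sketch): hdfs accepts file objects and generators as data.
with open(sourceFile, 'rb') as fo:
    client.write(dpath, data=fo, overwrite=True)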
Example #12
# https://hdfscli.readthedocs.io/en/latest/quickstart.html
from hdfs import InsecureClient
client = InsecureClient(url='http://localhost:9870/', user='******')

# from hdfs import Config
# client = Config().get_client('dev')

# Retrieving a file or folder content summary.
content = client.content('/test_datas')

# Listing all files inside a directory.
fnames = client.list('/test_datas')

# Retrieving a file or folder status.
# status = client.status('/test_datas/sample1.txt')

# Renaming ("moving") a file.
# client.rename('/test_datas/sample1.txt', '/test_datas/sample5.txt')

# Deleting a file or folder.
# client.delete('/test_temps', recursive=True)

hdfspath = '/test_datas/'
localpath = '/Users/janevallette/Documents/Develops/learn_bigdata/datas/cat.jpeg'
result = client.upload(hdfspath, localpath)

# Writing part of a file.
# with open('datas/upfile.txt') as reader, client.write('/test_datas/upfile1.txt') as writer:
#   for line in reader:
#     # if line.startswith('-'):
#       writer.write(line)
Example #13
from hdfs import InsecureClient


class HDFSLibrary:
    """
        Test library for working with HDFS
    """
    WEB_HDFS_URL = ""
    client = ""

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print("{} >> {} >> {}".format(namenode, port, self.WEB_HDFS_URL))
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                print("ERROR: File does not exist: " + file_path)
                return "ERROR: File does not exist: ", file_path
                # exit(172)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        # read the whole file back as text
        with self.client.read(file_path, encoding='utf-8') as reader:
            return reader.read()

    def search_string_in_hdfs_file(self,
                                   file_path,
                                   text1,
                                   text2="aqwszx",
                                   text3="xzswqa"):
        ret = self.check_hdfs_file_exists(file_path, stop=True)
        found = "" if ret else ret
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(
                        text2) == -1 and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self,
                                     file_path,
                                     text1,
                                     text2="aqwszx",
                                     text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(
                        text2) != -1 or line.find(text3) != -1:
                    return False
        return True

    ########################
    # # BASIC FUNCTIONS: # #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """
        Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """
        Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """
        Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """
        Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """
        Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful otherwise `False`
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """
        Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path,
                                    local_path,
                                    overwrite=True,
                                    n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """
        Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path,
                                  local_path,
                                  overwrite=True,
                                  n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """
        Get the checksum value for file
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """
        Create a directory or recursive dirs on HDFS
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
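A brief usage sketch; the namenode address and paths are placeholders:

lib = HDFSLibrary(namenode="namenode", port="9870")
if lib.check_hdfs_file_exists("/data/report.txt"):
    print(lib.get_hdfs_file_content("/data/report.txt"))
lib.create_hdfs_dir("/data/archive")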
Example #14
# 4. Move the HDFS file /activity1/data/RandomText.txt to /activity1/data/NotSoRandomText.txt
client.rename('/activity1/data/RandomText.txt',
              '/activity1/data/NotSoRandomText.txt')

with open('./RandomText.txt', 'r') as f:
    temp = f.read()

# 5. Append the local file RandomText.txt to the end of the HDFS file: /activity1/data/NotSoRandomText.txt
client.write(hdfs_path='/activity1/data/NotSoRandomText.txt',
             data=temp,
             append=True)

# 6. List the disk space used by the directory /activity1/data/
diskSpaceUsed = client.content('/activity1/data/', strict=True)
print(diskSpaceUsed['spaceConsumed'])

# 7. Put the local file MoreRandomText.txt into HDFS as the path: /activity1/data/MoreRandomText.txt
client.upload(hdfs_path='/activity1/data/', local_path='./MoreRandomText.txt')
print(client.list('/activity1/data'))

# 8. Recursively list the contents of the directory /activity1/
fnames = client.list('/activity1')

fpaths = [
    psp.join(dpath, fname) for dpath, _, fnames in client.walk('/activity1')
    for fname in fnames
]

print(fpaths)
Example #15
    stop = time()
    print("Stop: " + str(stop))
    print("total time: " + str(stop - start) + " seconds")
elif model == 'upload_delete':
    start = time()
    print("Start: " + str(start))
    if False:
        do_foreach_file('32652(copy)/5104', createtif)
    upload_tif(client_hdfs)
    stop = time()
    print("Stop: " + str(stop))
    print("total time: " + str(stop - start) + " seconds")
elif model == 'upload_download':
    #do_foreach_file('32652(copy)/5104', createtif)
    #client_hdfs.delete('/gf1', recursive=True)
    if client_hdfs.content('/gf1', strict=False) is None:
        client_hdfs.upload('/gf1', '32652(copy)')
    start = time()
    print("Start: " + str(start))
    download_tif(client_hdfs)
    stop = time()
    print("Stop: " + str(stop))
    print("total time: " + str(stop - start) + " seconds")
elif model == 'rows_download':
    if client_hdfs.content('/gf1', strict=False) is None:
        client_hdfs.upload('/gf1', '32652(copy)')
    start = time()
    print("Start: " + str(start))
    rows_download_tif(client_hdfs)
    stop = time()
    print("Stop: " + str(stop))
Example #16
from hdfs import InsecureClient


client = InsecureClient('http://manager.novalocal:50070', user='******')


# list what is in the working directory
print(client.list('/student9_7'))
'''
['cur_readme', 'googlobots.txt', 'py_dir_02', 'readme', 'test', 'test2', 'testdir']
'''


# check the size of the working directory
print(client.content('/student9_7'))
'''
{'directoryCount': 3, 'fileCount': 5, 'length': 10552, 'quota': -1, 'spaceConsumed': 31637, 'spaceQuota': -1}
'''


# read the `test` file
with client.read('/student9_7/test') as reader:
  test = reader.read()
print(test)
'''
b'test file for hdfs\n'
'''


# copy the `test` file from HDFS to the local home directory as `downloaded_file_via_py3`
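The example breaks off here; the copy described in the last comment would look roughly like this sketch (the overwrite flag is added for illustration):

client.download('/student9_7/test', './downloaded_file_via_py3', overwrite=True)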