Example #1
import json
import os

from hdfs import InsecureClient


def main():

    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    # create directory in HDFS
    client.makedirs('/test')

    #list content
    ll = client.list('/')
    print(ll)

    # create file in HDFS
    data = [{
        "name": "Anne",
        "salary": 10000
    }, {
        "name": "Victor",
        "salary": 9500
    }]
    with client.write('/test/sample_file.json',
                      encoding='utf-8') as json_file_in_hdfs:
        json.dump(data, json_file_in_hdfs)
    # OR
    client.write(os.path.join('/', 'test', 'sample_file2.json'),
                 data=json.dumps(data),
                 encoding='utf-8')

    # download file from HDFS
    client.download('/test/sample_file.json', './file_from_hadoop.json')

    # upload file to HDFS
    client.upload('/test/local_file_in_hadoop.json', './file_from_hadoop.json')
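For completeness, the file written above can be read straight back with the same client; a minimal sketch reusing the names from Example #1:

    # deserialize the JSON written to /test/sample_file.json
    with client.read('/test/sample_file.json', encoding='utf-8') as reader:
        loaded = json.load(reader)
    print(loaded)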
Example #2
    def download_directory(self, directory_url):
        '''Downloads a directory from remote HDFS to local, archives it and
        returns the tar.gz of the directory'''
        logger.log_info("Downloading the directory {0}".format(directory_url))
        # Remove the base url from the absolute directory path provided as a parameter.
        # For example, if the absolute path is hdfs://alpha:9000/configuration/12345/drift,
        # the statement below will return /configuration/12345/drift
        directory_name_with_path = urllib3.util.parse_url(directory_url).path
        directory_name = os.path.split(directory_name_with_path)[1]
        web_hdfs_url = Environment().get_web_hdfs_url()
        session = SwSessionManager().get_session()
        user_name = session.get_username()
        client = InsecureClient(web_hdfs_url, user_name)
        try:
            with tempfile.TemporaryDirectory() as temp:
                client.download(hdfs_path=directory_name_with_path,
                                local_path=temp,
                                n_threads=5)
                tmp_archive = os.path.join(temp)
                data = io.BytesIO()
                with open(shutil.make_archive(tmp_archive, 'gztar', temp),
                          "rb") as output_data:
                    data.write(output_data.read())
                data.seek(0)
            return send_file(data,
                             as_attachment=True,
                             attachment_filename=directory_name + ".tar.gz")
        except Exception as e:
            raise ServiceError(
                "Downloading the folder from HDFS failed with the error: {0}".
                format(str(e)))
Example #3
import logging
import os

import joblib
import numpy as np
from PIL import Image
from hdfs import InsecureClient


class Prediction_ML():
    def __init__(self, dir_algo, algo, path_img):
        logging.info('prediction_ML.init')
        self.directory_algo = dir_algo
        self.path_img = path_img
        self.algo = algo

        self.hdfs_client = InsecureClient('http://192.168.1.4:9870',
                                          user='******')
        self.image = self.read_image(self.path_img, 240)

    def read_image(self, path_img, img_size=0):
        logging.info('prediction_ML.read_image')
        img = 0

        try:
            with self.hdfs_client.read(path_img) as reader:
                img = Image.open(reader)
            if img_size != 0:
                img = img.resize((img_size, img_size))
            img = img.convert('L').convert('RGB')
            img = np.asarray(img).flatten()

        except IOError as err:
            logging.error("Error reading image or path")
            logging.error(err)

        except Exception as err:
            logging.error("Unkownown error in read_image")
            logging.error(err)

        return img

    def run(self):
        try:
            self.hdfs_client.download(
                self.directory_algo + self.algo + ".model",
                self.algo + ".model")
            model = joblib.load(self.algo + ".model")
            os.remove(self.algo + ".model")
            label = model.predict([self.image])
            try:
                array_proba = model.predict_proba([self.image])[0]
                proba = array_proba[label[0]]
            except Exception:
                # the model may not implement predict_proba
                proba = -1

            return label[0], proba

        except IOError as err:
            logging.error('Error model ' + str(self.algo) +
                          ' is not trained yet!')
            logging.error(
                'Train this model first before using it for predictions')
            return -1, 1
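A hypothetical usage sketch of the class above; the model directory, algorithm name and HDFS image path are placeholders:

predictor = Prediction_ML('/models/', 'random_forest', '/images/sample.png')
label, proba = predictor.run()
print(label, proba)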
Example #4
class HDFSStorage(Storage):
    def __init__(self, bucket_name: str, folder_name: str):
        super().__init__(bucket_name, folder_name)
        self.client = InsecureClient(url=settings.HDFS_CONN,
                                     user=settings.HDFS_USERNAME)

    def setup(self) -> HDFSResource:
        super().setup()

        self.client.makedirs(f"{self.bucket_name}/{self.folder_name}")

        return HDFSResource(
            resource=f"hdfs:/{self.bucket_name}/{self.folder_name}/")

    def put_file(self,
                 file_path: Union[str, Path],
                 rename: Optional[str] = None) -> HDFSResource:
        if isinstance(file_path, Path):
            file_path = str(file_path)

        file_name = Path(file_path).name if not rename else rename

        # copy file to task directory
        if not file_path.startswith(str(self.local_dir)):
            file_path = shutil.copy(file_path, Path(self.local_dir, file_name))

        try:
            self.client.upload(
                f"{self.bucket_name}/{self.folder_name}/{file_name}",
                file_path)
        except (gaierror, NewConnectionError):
            # propagate connection errors to the caller unchanged
            raise

        return HDFSResource(
            resource=f"hdfs:/{self.bucket_name}/{self.folder_name}/{file_name}"
        )

    def get_file(self, data_file: str) -> str:
        if not data_file.startswith("hdfs:"):
            raise NotValidScheme(
                "Object file prefix is invalid: expected `hdfs:`")

        _, bucket_name, folder_name, file_name = data_file.split("/")
        file_path = Path(self.temp_dir, bucket_name, folder_name, file_name)

        if not file_path.is_file():
            try:
                self.client.download(data_file, file_path)
            except Exception as err:
                print(err)

        return str(file_path)

    def remove_remote_dir(self, omit_files: List[str] = None) -> None:
        pass
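A hypothetical round trip with this storage class, assuming settings.HDFS_CONN and settings.HDFS_USERNAME point at a reachable WebHDFS endpoint (bucket, folder and file names are placeholders):

storage = HDFSStorage('mybucket', 'run-42')
storage.setup()                              # creates mybucket/run-42 on HDFS
res = storage.put_file('/tmp/data.csv')      # -> hdfs:/mybucket/run-42/data.csv
# assumes HDFSResource exposes its 'resource' field
local_copy = storage.get_file(res.resource)  # downloads on first access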
Example #5
    def download(self, remote_location, local_filepath):
        url = urlparse(remote_location)

        if not url.hostname:
            raise ValueError('Hostname was not found in provided URL: %s' %
                             remote_location)

        connect_url = self._build_namenode_url(url.hostname, url.port)
        hdfs_path = url.path

        hdfs = InsecureClient(connect_url)
        hdfs.download(hdfs_path, local_filepath)
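The _build_namenode_url helper is not shown in this example; a minimal sketch under the assumption that it simply joins host and port, falling back to the classic WebHDFS port 50070 when the URL omits one:

    def _build_namenode_url(self, hostname, port):
        # hypothetical helper: port is None when absent from the parsed URL
        return 'http://%s:%s' % (hostname, port or 50070)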
Example #6
class Storage:
    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if protocol.lower() == 'webHDFS'.lower():
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
            for f in 'upload download list status delete'.split():
                setattr(self, f, getattr(self,
                                         '%s_%s' % (f, protocol.lower())))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path,
                                  hdfs_path=remote_path,
                                  **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path,
                                    hdfs_path=remote_path,
                                    overwrite=True,
                                    **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
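A hypothetical usage sketch of the dispatching wrapper above; the endpoint and user are placeholders:

storage = Storage('webHDFS', 'http://namenode:50070', user='hdfs')
storage.upload('./report.csv', '/data/report.csv')  # dispatches to upload_webhdfs
print(storage.list('/data'))                        # dispatches to list_webhdfs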
Example #7
def download_data(request):
    try:
        data_id = request.GET.get('data_id')
        user_id = request.GET.get('user_id')
        data_path = data_id + '.csv'

        fetched = Datasets.objects.filter(data_id=data_id,
                                          user_id=user_id).values(
                                              'hdfs_path', 'data_name')
        if len(fetched) == 0:
            raise Exception('Oops! No access!')
        if list(fetched)[0]['hdfs_path']:
            client = InsecureClient("http://hdfs.neurolearn.com:50070",
                                    user="******")
            client.download(list(fetched)[0]['hdfs_path'] + '/' +
                            list(fetched)[0]['data_name'],
                            data_path,
                            overwrite=True)
        else:
            data_cont_query = Datasets.objects.filter(
                data_id=data_id, user_id=user_id).values('data_cont')
            if len(data_cont_query) == 0:
                raise Exception('Oops! No access!')
            data_cont = list(data_cont_query)[0]['data_cont']
            pd.read_json(data_cont).to_csv(data_path, index=False)

        data_file = open(data_path, 'rb')

        response = FileResponse(data_file)
        response['Content-Type'] = 'application/octet-stream'
        response['Content-Disposition'] = 'attachment;filename="' + data_id + '.csv"'
    except Exception as e:
        traceback.print_exc()
        response_content = {}
        response = HttpResponse()
        response_content['msg'] = str(e)
        response_content['error_num'] = 1
        response.write(json.dumps(response_content))

    return response
Example #8
def config():
    # Read params from Json
    if (not request.json or 'preprocess' not in request.json
            or 'requirements' not in request.json
            or 'model' not in request.json):
        abort(400)
    preprocess_file = request.json['preprocess']
    requirements_file = request.json['requirements']
    model_file = request.json['model']
    hdfs_uri = request.json['hdfs_uri']
    logger.info('Read json configurations: OK!!')

    # Download files from HDFS
    client_hdfs = InsecureClient(hdfs_uri)
    client_hdfs.download(requirements_file,
                         "./requirements.txt",
                         overwrite=True)
    client_hdfs.download(model_file, "./model.pickle", overwrite=True)
    client_hdfs.download(preprocess_file,
                         "./preprocess.pickle",
                         overwrite=True)
    logger.info('Download pickles: OK!!')

    # Install library dependencies
    subprocess.call("pip install -r ./requirements.txt", shell=True)
    current_app.hdfs_uri = hdfs_uri
    current_app.inMemory = False
    current_app.configured = True
    current_app.client_hdfs = client_hdfs
    return jsonify({'status': "Docker configured"}), 201
Example #9
def download_file(path, test_case_number, task_number):
    try:
        client = InsecureClient(
            ('http://' + HADOOP_HOST_NAME + ':' + HADOOP_NAMENODE_PORT_NUMBER),
            user=HADOOP_USER_NAME)
    except Exception:
        print("Error connecting to hdfs client")
        return
    try:
        client.download(
            HADOOP_OUTPUT_PATH + task_number + test_case_number + "/",
            os.path.join(path, test_case_number))
    except Exception as e:
        print(e)
        print("Error downloading output file from hdfs")
        return
    try:
        client.delete(HADOOP_OUTPUT_PATH + task_number + test_case_number,
                      recursive=True)
    except Exception:
        print("Error deleting hdfs output directory")
        return
Example #10
class HdfsDownloader():
    FILENAME = __file__

    def init(self, cfg):
        url = cfg['source_url']
        user = cfg['user'] if 'user' in cfg else None
        root_path = cfg['root'] if 'root' in cfg else '/'
        if 'download_filename' not in cfg:
            cfg['download_filename'] = basename(url)

        http_protocol_prefix = 'http://'
        if url.startswith(http_protocol_prefix):
            index = url[len(http_protocol_prefix):].find('/') + len(
                http_protocol_prefix)
            host_port = url[:index]
            cfg['file_path'] = url[index:]
            self.hdfs_client = InsecureClient(url=host_port,
                                              user=user,
                                              root=root_path)
        return cfg

    def retrive_file(self, cfg):
        url = cfg['source_url']
        print("download hdfs file: {}".format(url))

        if url.startswith('http://'):
            path = cfg['file_path']
            self.hdfs_client.download(path,
                                      cfg['download_filename'],
                                      overwrite=True)
        elif url.startswith('hdfs://'):
            cmd = 'hadoop fs -get {}'.format(url)
            for out_ in run_command(cmd):
                print(out_.rstrip().decode('utf-8'))
        else:
            raise Exception(
                'Unsupported protocol. Only "http://" and "hdfs://" are supported')

        return basename(url)
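A hypothetical cfg exercising the WebHDFS branch of the downloader above (host and path are placeholders):

downloader = HdfsDownloader()
cfg = downloader.init({'source_url': 'http://namenode:50070/data/input.csv',
                       'user': 'hdfs'})
downloader.retrive_file(cfg)  # downloads to ./input.csv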
Example #11
    def get(self, period):

        print("Period to predict: ", period)

        # Connect to the HDFS client
        client = InsecureClient(url='http://namenode:9870', user='******')

        # Check that the saved model is present on HDFS
        if client.status(model_hdfs_remote_path + model_name, strict=False) is not None:

            # load model
            client.download(model_hdfs_remote_path + model_name, model_local_path, overwrite=True)
            model_fit = ARIMAResults.load(model_local_path + model_name)

            # Dataset for the evaluation
            df = get_data_cassandra()
            print(df.head())
            X = df['total_estimated_load'].values

            start_index = len(X)
            end_index = start_index + int(period)
            forecast = model_fit.predict(start=start_index, end=end_index)

            # df['date_est_load'] = df['date_est_load'].apply(pd.Timestamp)
            day = df['date_est_load'].values[-1].date()
            print(day)
            print(type(day))
            day += datetime.timedelta(days=1)

            res = {}
            for yhat in forecast:
                res[day.strftime("%d/%m/%Y")] = yhat
                day += datetime.timedelta(days=1)

            return res

        return "Service has been stopped"
Example #12
# client is assumed to have been created earlier in this script, e.g.:
# from hdfs import InsecureClient
# client = InsecureClient('http://localhost:50070', user='hadoop')

# client.delete('/test_temps', recursive=True)

hdfspath = '/test_datas/'
localpath = '/Users/janevallette/Documents/Develops/learn_bigdata/datas/cat.jpeg'
result = client.upload(hdfspath, localpath)

# Writing part of a file.
# with open('datas/upfile.txt') as reader, client.write('/test_datas/upfile1.txt') as writer:
#   for line in reader:
#     # if line.startswith('-'):
#       writer.write(line)

from json import dump
# Writing a serialized JSON object.
# with open('datas/model.json') as reader, client.write('/test_datas/model1.json') as writer:
#   dump(reader, writer)

# with open('datas/cat.jpeg') as reader, client.write('/test_files/cat1.jpeg') as writer:
#   dump(reader, writer)

# Download a file or folder locally.
client.download('/test_datas/', 'datas/', n_threads=5)

# Loading a file in memory.
# with client.read('/test_datas/sample2.txt') as reader:
#   sample1 = reader.read()

# # Directly deserializing a JSON object.
# with client.read('/test_datas/model1.json', encoding='utf-8') as reader:
#   from json import load
#   model = load(reader)
Example #13
def Home(request):
    client = InsecureClient('http://localhost:50070', user='******')
    if not os.path.exists('1989.csv'):
        client.download('1989.csv', '1989.csv')
    if not os.path.exists('1991.csv'):
        client.download('1991.csv', '1991.csv')
    if not os.path.exists('1996.csv'):
        client.download('1996.csv', '1996.csv')
    if not os.path.exists('1998.csv'):
        client.download('1998.csv', '1998.csv')
    if not os.path.exists('1999.csv'):
        client.download('1999.csv', '1999.csv')
    if not os.path.exists('2004.csv'):
        client.download('2004.csv', '2004.csv')
    if not os.path.exists('2009.csv'):
        client.download('2009.csv', '2009.csv')
    if not os.path.exists('2014.csv'):
        client.download('2014.csv', '2014.csv')
    if not os.path.exists('Candidate.csv'):
        client.download('Candidate.csv', 'Candidate.csv')
    return render(request, 'election/home.html')
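The repeated existence checks above can be collapsed into a loop; a behavior-equivalent sketch:

def Home(request):
    client = InsecureClient('http://localhost:50070', user='******')
    for name in ('1989.csv', '1991.csv', '1996.csv', '1998.csv', '1999.csv',
                 '2004.csv', '2009.csv', '2014.csv', 'Candidate.csv'):
        if not os.path.exists(name):
            client.download(name, name)
    return render(request, 'election/home.html')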
Example #14
from hdfs import InsecureClient
import os
client = InsecureClient("http://10.150.144.225:50070", user="******")
client.download("/SC_recommendation/caml/Electronics_new_strategy_new_2",
                "data/",
                overwrite=True)
print(os.listdir("data/"))
Example #15
class HisiHdfs:
    def __init__(self):
        self._c = InsecureClient(url="http://{}:14000".format(
            HisiHdfs.get_host()),
                                 user='******',
                                 root="/")
        # self._c = InsecureClient(url="http://10.154.67.254:14000", user='******', root="/")

    @staticmethod
    def get_host():
        domain = 'hdfs-ngx1.turing-ci.hisilicon.com'
        try:
            socket.gethostbyname(domain)
            return domain
        except Exception as e:
            return '10.154.67.254'

    @staticmethod
    def build_month_path(build_scene):
        '''daily build path'''
        return '/compilepackage/CI_Version/{}/br_hisi_trunk_ai/{}'.\
            format(build_scene, datetime.datetime.today().strftime('%Y%m'))

    @staticmethod
    def prebuild_month_path(build_scene):
        '''compile path'''
        return '/compilepackage/CI_Version/{}/br_hisi_trunk_ai_PRE_COMPILE/{}'.\
            format(build_scene, datetime.datetime.today().strftime('%Y%m'))

    def find_newest_build(self, build_scene):
        builds = self._c.list(HisiHdfs.build_month_path(build_scene), True)
        newest_build_name = None
        for build in builds:
            if type(build) != tuple:
                logging.warning("Unexpected build format {}".format(build))
                continue
            if len(build) < 2:
                logging.warning("Unexpected build format {}".format(build))
                continue
            if type(build[1]) != dict:
                logging.warning("Unexpected build format[1] {}".format(build))
                continue
            if build[1].get('type', None) != "DIRECTORY":
                logging.warning(
                    "Found unexpected build type(not DIRECTORY) {}".format(
                        build))
                continue
            if type(build[0]) != str:
                logging.warning("Unexpected build format[0] {}".format(build))
                continue
            elements = build[0].split('_')
            if len(elements) != 3:
                logging.warning("Unexpected build name {}".format(build))
                continue
            if elements[2] != "newest":
                continue
            # build_date = datetime.datetime.strptime('_'.join(elements[:2]), "%Y%m%d_%H%M%S%f")
            if newest_build_name is None:
                newest_build_name = build[0]
                continue
            if newest_build_name < build[0]:
                newest_build_name = build[0]
        return newest_build_name

    def path_exists(self, base_path: str, build_name: str):
        path = "{}/{}".format(base_path, build_name)
        return self._c.status(path, strict=False) is not None

    def find_package(self,
                     base_path: str,
                     build_name: str,
                     package_type: PackageType,
                     os_type=None,
                     arch=None):
        if os_type is None:
            os_type, arch = get_env()
        path = "{}/{}".format(base_path, build_name)
        packages = self._c.list(path, True)
        pr = package_type.get_name_re()
        for package_name, package_info in packages:
            pm = pr.match(package_name)
            if pm is not None:
                if OsType.analyse_os(pm.group('os')) == os_type and pm.group(
                        'arch') == arch:
                    return package_name
        return None

    def download_package(self, base_path: str, build_name: str,
                         package_name: str, local_path: str):
        return self._c.download(hdfs_path="{}/{}/{}".format(
            base_path, build_name, package_name),
                                local_path=local_path,
                                overwrite=True)

    def download_compile_package(self, build_scene: str, build_name: str,
                                 package_name: str, local_path: str):
        return self.download_package(HisiHdfs.prebuild_month_path(build_scene),
                                     build_name, package_name, local_path)

    def download_daily_package(self, build_scene: str, build_name: str,
                               package_name: str, local_path: str):
        return self.download_package(HisiHdfs.build_month_path(build_scene),
                                     build_name, package_name, local_path)

    def download_newest(self,
                        local_path: str,
                        packages: List[PackageType],
                        os_type=None,
                        arch=None):
        if not os.path.isdir(local_path):
            raise FileNotFoundError(
                "The path {} does not exists".format(local_path))
        if os_type is None:
            os_type, arch = get_env()

        build_scenes_to_build_name = {}
        package_names = []
        print("Begin to download the newest run packages")
        for package in packages:
            build_scene = package.get_build_scene()
            # Cache the newest build name per build scene so repeated packages
            # from the same scene do not trigger another HDFS lookup.
            newest_build_name = build_scenes_to_build_name.get(build_scene)
            if newest_build_name is None:
                newest_build_name = self.find_newest_build(build_scene)
                build_scenes_to_build_name[build_scene] = newest_build_name
            if newest_build_name is None:
                logging.error("Cannot find the newest build")
                raise Exception("Cannot find the newest build")
            package_name = self.find_package(
                HisiHdfs.build_month_path(build_scene), newest_build_name,
                package, os_type, arch)
            if package_name is None:
                logging.error(
                    "Cannot find the package {}, os {}, arch {}".format(
                        package, os_type, arch))
                raise Exception("Cannot find package")
            with shell_printer.DotPrinter(
                    "Begin to download {} from {} to {}".format(
                        package_name, newest_build_name, local_path)):
                self.download_daily_package(build_scene, newest_build_name,
                                            package_name, local_path)
            logging.info("Download {} to {} successfully".format(
                package_name, local_path))
            package_names.append(package_name)
        return package_names

    def download_compile_packages(self, build_name: str, local_path: str,
                                  package_types: List[PackageType]):
        self.wait_compile_paths_ready(package_types, build_name)
        package_names = []
        for package_type in package_types:
            package_name = self.find_package(
                HisiHdfs.prebuild_month_path(package_type.get_build_scene()),
                build_name, package_type)
            if package_name is None:
                with shell_printer.DotPrinter("Wait package {} from {}".format(
                        package_type.name, build_name)):
                    while package_name is None:
                        logging.debug(
                            "Can not find package {} from {}, sleep".format(
                                package_type.name, build_name))
                        time.sleep(10)
                        package_name = self.find_package(
                            HisiHdfs.prebuild_month_path(
                                package_type.get_build_scene()), build_name,
                            package_type)
                    # In practice, downloading a file immediately after it is created can
                    # fail or yield an incomplete file, so wait 5 seconds before downloading
                    time.sleep(5)

            with shell_printer.DotPrinter("Begin to download {} to {}".format(
                    package_name, local_path)):
                self.download_compile_package(package_type.get_build_scene(),
                                              build_name, package_name,
                                              local_path)
            logging.info("Download {} to {} successfully".format(
                package_name, local_path))
            package_names.append(package_name)
        return package_names

    def wait_compile_paths_ready(self, package_types: List[PackageType],
                                 build_name: str):
        scenes = set([pt.get_build_scene() for pt in package_types])
        for build_scene in scenes:
            build_path = HisiHdfs.prebuild_month_path(build_scene)
            if not self.path_exists(build_path, build_name):
                with shell_printer.DotPrinter(
                        "The build({}) path({}) has not been created, wait".
                        format(build_name, build_path)):
                    while not self.path_exists(build_path, build_name):
                        time.sleep(1)
Example #16
class HDFSLibrary:
    """
        Test library for working with HDFS
    """
    WEB_HDFS_URL = ""
    client = ""

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                print("ERROR: Error: File does not exist: ", file_path)
                return "ERROR: Error: File does not exist: ", file_path
                # exit(172)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        # client.read() yields bytes by default; pass an encoding to get str lines
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self,
                                   file_path,
                                   text1,
                                   text2="aqwszx",
                                   text3="xzswqa"):
        ret = self.check_hdfs_file_exists(file_path, stop=True)
        found = "" if ret else ret
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(
                        text2) == -1 and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self,
                                     file_path,
                                     text1,
                                     text2="aqwszx",
                                     text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(
                        text2) != -1 or line.find(text3) != -1:
                    return False
        return True

    ########################
    # # BASIC FUNCTIONS: # #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """
        Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """
        Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """
        Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """
        Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """
        Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful otherwise `False`
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """
        Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path,
                                    local_path,
                                    overwrite=True,
                                    n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """
        Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path,
                                  local_path,
                                  overwrite=True,
                                  n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """
        Get the checksum value for file
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """
        Create a directory or recursive dirs on HDFS
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
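A hypothetical usage sketch of this test library; the namenode host and file paths are placeholders:

lib = HDFSLibrary(namenode='namenode.example.com', port='50070')
if lib.check_hdfs_file_exists('/data/input.csv'):
    print(lib.get_hdfs_file_content('/data/input.csv'))
    lib.copy_to_local_hdfs_file('/data/input.csv', './input.csv')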
Example #17
class HadoopFileSystem():
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme != 'http' and u.scheme != 'https':
            raise ValueError("Invalid name node address")

        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'

    def normalize_path(self, path):
        path = os.path.normpath(path)
        path = self.strip_prefix(path)
        while path and path[0] == os.sep:
            path = path[1:]
        return os.path.join(self.localdir, path)

    def strip_prefix(self, path):
        return path[len(self.prefix):] if path.startswith(
            self.prefix) else path

    def strip_root(self, path):
        path = self.strip_prefix(path)
        if path.startswith(self.url):
            path = path[len(self.url):]
            if not path.startswith(self.localdir):
                raise ValueError(
                    'Invalid hdfs path. It must start with the root directory')
        return path[len(self.localdir):] if path.startswith(
            self.localdir) else path

    def create_folder(self, path):
        try:
            path = self.normalize_path(path)
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def remove(self, path):
        try:
            path = self.normalize_path(path)
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def rename(self, oldpath, newpath):
        try:
            oldpath = self.normalize_path(oldpath)
            newpath = self.normalize_path(newpath)
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def get_files(self, path):
        path = self.normalize_path(path)
        files = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] != "DIRECTORY":
                files.append(f)
        return files

    def get_folders(self, path):
        path = self.normalize_path(path)
        folders = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] == "DIRECTORY":
                folders.append(f)
        return folders

    def exists(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return not (status is None)

    def isdir(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "DIRECTORY"

    def isfile(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "FILE"

    def read(self, path):
        path = self.normalize_path(path)
        with self.client.read(path) as reader:
            return reader.read().decode('utf-8')

    def write(self, path, content):
        path = self.normalize_path(path)
        self.client.write(path, content)

    def make_json(self, path):
        normalized_path = self.normalize_path(path)
        data_json = {
            'path': urljoin(self.url, normalized_path),
            'text': os.path.basename(path)
        }
        status = self.client.status(normalized_path, False)

        if status is not None:
            data_json['folder'] = status['type'] == "DIRECTORY"
            if status['type'] == "DIRECTORY":
                data_json['nodes'] = [
                    self.make_json(os.path.join(path, fn))
                    for fn in self.client.list(normalized_path)
                ]
        #print(json.dumps(data_json))
        return data_json

    def save_upload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            if isfile(fullpath):
                fullpath = os.path.dirname(fullpath)
            self.client.upload(self.normalize_path(fullpath), localpath, True)
        except Exception:
            pass

    def download(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(path))
            return self.client.download(path, localpath, True)
        else:
            return None
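A hypothetical usage sketch; the WebHDFS URL (whose path component becomes the root directory) and user are placeholders:

fs = HadoopFileSystem('http://namenode:50070/user/demo', user='demo')
fs.create_folder('reports')
fs.write('reports/hello.txt', 'hello world')
print(fs.read('reports/hello.txt'))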
Example #18
class HDFSWrapper(object):
    def __init__(self):
        self.__m_HDFS_Handler__ = None
        self.__m_HDFS_WebFSDir__ = None
        self.__m_HDFS_User__ = None
        self.__m_HDFS_WebFSURL__ = None

    def HDFS_makedirs(self, hdfs_path):
        """ Create a directory on HDFS """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it first.")
        self.__m_HDFS_Handler__.makedirs(
            os.path.join(self.__m_HDFS_WebFSDir__,
                         hdfs_path).replace('\\', '/'))

    def HDFS_setPermission(self, hdfs_path, permission):
        """ Change the permission of the specified file """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it first.")
        m_hdfs_filepath = os.path.dirname(hdfs_path)
        m_hdfs_filename = os.path.basename(hdfs_path)
        self.__m_HDFS_Handler__.set_permission(os.path.join(
            self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
            m_hdfs_filename).replace('\\', '/'),
                                               permission=permission)

    def HDFS_Connect(self, p_szURL, p_szUser):
        """ Connect to HDFS; the URL uses the WebHDFS protocol """
        m_HDFS_Protocal = p_szURL.split("://")[0]
        m_HDFS_NodePort = p_szURL[len(m_HDFS_Protocal) + 3:].split("/")[0]
        m_HDFS_WebFSURL = m_HDFS_Protocal + "://" + m_HDFS_NodePort
        self.__m_HDFS_User__ = p_szUser
        self.__m_HDFS_WebFSURL__ = m_HDFS_WebFSURL
        self.__m_HDFS_WebFSDir__ = p_szURL[len(m_HDFS_WebFSURL):]
        self.__m_HDFS_Handler__ = InsecureClient(url=m_HDFS_WebFSURL,
                                                 user=p_szUser,
                                                 root=self.__m_HDFS_WebFSDir__)
        # Try to create the directory in case it does not exist
        self.__m_HDFS_Handler__.makedirs(
            self.__m_HDFS_WebFSDir__.replace('\\', '/'))

    def HDFS_CD(self, p_szPath):
        self.__m_HDFS_WebFSDir__ = os.path.join(self.__m_HDFS_WebFSDir__,
                                                p_szPath)
        self.__m_HDFS_Handler__ = InsecureClient(url=self.__m_HDFS_WebFSURL__,
                                                 user=self.__m_HDFS_User__,
                                                 root=self.__m_HDFS_WebFSDir__)
        # Try to create the directory in case it does not exist
        self.__m_HDFS_Handler__.makedirs(
            self.__m_HDFS_WebFSDir__.replace('\\', '/'))

    def HDFS_status(self, hdfs_path=""):
        """ Return the status of the given path """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it first.")

        m_ReturnList = []
        m_Status = self.__m_HDFS_Handler__.status(hdfs_path)
        m_ReturnList.append((hdfs_path, m_Status))
        return m_ReturnList

    def HDFS_list(self, hdfs_path="", recursive=False):
        """ Return the files under the directory """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it first.")

        m_ReturnList = []
        if not recursive:
            for row in self.__m_HDFS_Handler__.list(hdfs_path, status=True):
                m_ReturnList.append((os.path.join(hdfs_path, row[0]), row[1]))
            return m_ReturnList
        else:
            for row in self.__m_HDFS_Handler__.list(hdfs_path, status=True):
                if row[1]['type'].upper() == 'DIRECTORY':
                    m_ReturnList.append(
                        (os.path.join(hdfs_path, row[0]).replace("\\",
                                                                 "/"), row[1]))
                    m_ReturnList.extend(
                        self.HDFS_list(os.path.join(hdfs_path,
                                                    row[0]).replace("\\", "/"),
                                       recursive=True))
                else:
                    m_ReturnList.append(
                        (os.path.join(hdfs_path, row[0]).replace("\\",
                                                                 "/"), row[1]))
            return m_ReturnList

    def HDFS_Download(self, hdfs_path="", local_path="", recursive=False):
        """ Download files from HDFS to local """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it first.")

        # If local_path is passed as a directory and it does not exist locally, create it
        m_LocalPath = local_path
        if m_LocalPath.endswith("/") and not os.path.exists(m_LocalPath):
            os.makedirs(m_LocalPath)

        m_FileList = self.HDFS_list(recursive=recursive)
        for row in m_FileList:
            if fnmatch.fnmatch(row[0], hdfs_path):
                self.__m_HDFS_Handler__.download(row[0],
                                                 m_LocalPath,
                                                 overwrite=True)

    def HDFS_Upload(self, local_path, hdfs_path=""):
        """ Upload files to HDFS """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it first.")

        for file in glob(local_path):
            if hdfs_path == "":
                m_hdfs_filepath = ""
                m_hdfs_filename = os.path.basename(file)
            else:
                if hdfs_path.endswith("/"):
                    m_hdfs_filepath = hdfs_path
                    m_hdfs_filename = os.path.basename(file)
                else:
                    m_hdfs_filepath = os.path.dirname(hdfs_path)
                    m_hdfs_filename = os.path.basename(hdfs_path)
            try:
                remote_status = self.__m_HDFS_Handler__.status(
                    hdfs_path=os.path.join(self.__m_HDFS_WebFSDir__,
                                           m_hdfs_filepath).replace('\\', '/'),
                    strict=True)
                if remote_status['type'] == "FILE":
                    # The remote path that should be a directory actually holds a stray file; delete it
                    self.__m_HDFS_Handler__.delete(os.path.join(
                        self.__m_HDFS_WebFSDir__,
                        m_hdfs_filepath).replace('\\', '/'),
                                                   recursive=True)
                remote_status = self.__m_HDFS_Handler__.status(
                    os.path.join(self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
                                 m_hdfs_filename).replace('\\', '/'))
                if remote_status['type'] == "DIRECTORY":
                    # The remote directory already exists; try to delete it
                    self.__m_HDFS_Handler__.delete(os.path.join(
                        self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
                        m_hdfs_filename).replace('\\', '/'),
                                                   recursive=True)
            except HdfsError:
                # The remote directory does not exist; the subsequent upload will create it
                pass
            self.__m_HDFS_Handler__.upload(os.path.join(
                self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
                m_hdfs_filename).replace('\\', '/'),
                                           file,
                                           overwrite=True,
                                           cleanup=True)

    def Process_SQLCommand(self, p_szSQL):
        try:
            m_szSQL = p_szSQL.strip()
            matchObj = re.match(r"hdfs\s+connect\s+(.*)\s+with\s+user\s+(.*)$",
                                m_szSQL, re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_HDFSServer = str(matchObj.group(1)).strip()
                m_HDFSUser = str(matchObj.group(2)).strip()
                self.HDFS_Connect(m_HDFSServer, m_HDFSUser)
                return None, None, None, None, "Hdfs Server set successful."

            matchObj = re.match(r"hdfs\s+cd\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_HDFSPath = str(matchObj.group(1)).strip()
                self.HDFS_CD(m_HDFSPath)
                return None, None, None, None, "Hdfs root dir change successful."

            matchObj = re.match(r"hdfs\s+status\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_TargetFileList = str(matchObj.group(1)).strip()
                m_ReturnFileList = self.HDFS_status(m_TargetFileList)
                m_Result = []
                for (m_FileName, m_FileProperties) in m_ReturnFileList:
                    if m_FileProperties["type"] == "FILE":
                        m_PermissionMask = "-"
                    elif m_FileProperties["type"] == "DIRECTORY":
                        m_PermissionMask = "d"
                    else:
                        m_PermissionMask = "?"
                    if len(m_FileProperties["permission"]) == 3:
                        for m_nPos in range(0, 3):
                            if m_FileProperties["permission"][m_nPos] == "0":
                                m_PermissionMask = m_PermissionMask + "---"
                            elif m_FileProperties["permission"][m_nPos] == "1":
                                m_PermissionMask = m_PermissionMask + "--x"
                            elif m_FileProperties["permission"][m_nPos] == "2":
                                m_PermissionMask = m_PermissionMask + "-w-"
                            elif m_FileProperties["permission"][m_nPos] == "3":
                                m_PermissionMask = m_PermissionMask + "-wx"
                            elif m_FileProperties["permission"][m_nPos] == "4":
                                m_PermissionMask = m_PermissionMask + "r--"
                            elif m_FileProperties["permission"][m_nPos] == "5":
                                m_PermissionMask = m_PermissionMask + "r-x"
                            elif m_FileProperties["permission"][m_nPos] == "6":
                                m_PermissionMask = m_PermissionMask + "rw-"
                            elif m_FileProperties["permission"][m_nPos] == "7":
                                m_PermissionMask = m_PermissionMask + "rwx"
                            else:
                                m_PermissionMask = m_PermissionMask + "???"
                    else:
                        m_PermissionMask = m_PermissionMask + "?????????"
                    m_ModifiedTime = str(
                        datetime.datetime.utcfromtimestamp(
                            m_FileProperties["modificationTime"] /
                            1000).strftime("%Y-%m-%d %H:%M:%S"))
                    m_Result.append([
                        m_TargetFileList, m_PermissionMask,
                        m_FileProperties["owner"], m_FileProperties["group"],
                        m_FileProperties["length"], m_ModifiedTime
                    ])
                return "HDFS file status:", m_Result, ["Path", "Permission", "owner", "group", "Size", "Modified"], \
                       None, "Total " + str(len(m_Result)) + " files listed."

            matchObj = re.match(r"hdfs\s+rm\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                if matchObj:
                    m_Bak_WebFSDir = self.__m_HDFS_WebFSDir__
                    m_FileDeleted = str(matchObj.group(1)).strip()
                    m_FileDeletedPath = os.path.dirname(m_FileDeleted)
                    m_FileDeletedName = os.path.basename(m_FileDeleted)
                    self.HDFS_CD(m_FileDeletedPath)
                    m_FileList = self.HDFS_list(self.__m_HDFS_WebFSDir__,
                                                recusive=False)
                    for row in m_FileList:
                        if fnmatch.fnmatch(os.path.basename(row[0]),
                                           m_FileDeletedName):
                            self.__m_HDFS_Handler__.delete(row[0],
                                                           recursive=True)
                    # 重新返回原目录
                    self.HDFS_CD(m_Bak_WebFSDir)
                return None, None, None, None, "Hdfs file deleted successful."

            matchObj = re.match(r"hdfs\s+makedirs\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_Dir = str(matchObj.group(1)).strip()
                self.HDFS_makedirs(m_Dir)
                return None, None, None, None, "Hdfs directory created successful."

            matchObj = re.match(r"hdfs\s+set_permission\s+(.*)\s+(.*)$",
                                m_szSQL, re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_File = str(matchObj.group(1)).strip()
                m_FilePermission = str(matchObj.group(2)).strip()
                self.HDFS_setPermission(m_File, m_FilePermission)
                return None, None, None, None, "Hdfs set permission successful."

            m_FileUpload = ""
            m_TargetDir = None
            matchObj = re.match(r"hdfs\s+upload\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileUpload = str(matchObj.group(1)).strip()
                m_TargetDir = ""
            matchObj = re.match(r"hdfs\s+upload\s+(.*)\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileUpload = str(matchObj.group(1)).strip()
                m_TargetDir = str(matchObj.group(2)).strip()
            if m_TargetDir is not None:
                self.HDFS_Upload(m_FileUpload, m_TargetDir)
                return None, None, None, None, "Hdfs file upload successful."

            m_FileDownload = ""
            m_TargetDir = None
            matchObj = re.match(r"hdfs\s+download\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileDownload = str(matchObj.group(1)).strip()
                m_TargetDir = ""
            matchObj = re.match(r"hdfs\s+download\s+(.*)\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileDownload = str(matchObj.group(1)).strip()
                m_TargetDir = str(matchObj.group(2)).strip()
            if m_TargetDir is not None:
                self.HDFS_Download(m_FileDownload, m_TargetDir)
                return None, None, None, None, "Hdfs file download successful."

            m_TargetFileList = None
            matchObj = re.match(r"hdfs\s+list(\s+)?$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_TargetFileList = ""
            matchObj = re.match(r"hdfs\s+list\s+(.*)?$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_TargetFileList = str(matchObj.group(1)).strip()
            if m_TargetFileList is not None:
                m_ReturnFileList = self.HDFS_list(m_TargetFileList,
                                                  recursive=True)
                m_Result = []
                for (m_FileName, m_FileProperties) in m_ReturnFileList:
                    if m_FileProperties["type"] == "FILE":
                        m_PermissionMask = "-"
                    elif m_FileProperties["type"] == "DIRECTORY":
                        m_PermissionMask = "d"
                    else:
                        m_PermissionMask = "?"
                    if len(m_FileProperties["permission"]) == 3:
                        for m_nPos in range(0, 3):
                            if m_FileProperties["permission"][m_nPos] == "0":
                                m_PermissionMask = m_PermissionMask + "---"
                            elif m_FileProperties["permission"][m_nPos] == "1":
                                m_PermissionMask = m_PermissionMask + "--x"
                            elif m_FileProperties["permission"][m_nPos] == "2":
                                m_PermissionMask = m_PermissionMask + "-w-"
                            elif m_FileProperties["permission"][m_nPos] == "3":
                                m_PermissionMask = m_PermissionMask + "-wx"
                            elif m_FileProperties["permission"][m_nPos] == "4":
                                m_PermissionMask = m_PermissionMask + "r--"
                            elif m_FileProperties["permission"][m_nPos] == "5":
                                m_PermissionMask = m_PermissionMask + "r-x"
                            elif m_FileProperties["permission"][m_nPos] == "6":
                                m_PermissionMask = m_PermissionMask + "rw-"
                            elif m_FileProperties["permission"][m_nPos] == "7":
                                m_PermissionMask = m_PermissionMask + "rwx"
                            else:
                                m_PermissionMask = m_PermissionMask + "???"
                    else:
                        m_PermissionMask = m_PermissionMask + "?????????"
                    m_ModifiedTime = str(
                        datetime.datetime.utcfromtimestamp(
                            m_FileProperties["modificationTime"] /
                            1000).strftime("%Y-%m-%d %H:%M:%S"))
                    m_Result.append([
                        m_FileProperties["pathSuffix"], m_PermissionMask,
                        m_FileProperties["owner"], m_FileProperties["group"],
                        m_FileProperties["length"], m_ModifiedTime
                    ])
                return "HDFS file List:", m_Result, ["Path", "Permission", "owner", "group", "Size", "Modified"], \
                       None, "Total " + str(len(m_Result)) + " files listed."
            return None, None, None, None, "Unknown HDFS Command."
        except (HDFSWrapperException, HdfsError) as he:
            if "SQLCLI_DEBUG" in os.environ:
                print('traceback.print_exc():\n%s' % traceback.print_exc())
                print('traceback.format_exc():\n%s' % traceback.format_exc())
            raise SQLCliException(he.message)
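A hypothetical command session with the wrapper above, matching the regular expressions handled by Process_SQLCommand (server URL and user are placeholders):

wrapper = HDFSWrapper()
wrapper.Process_SQLCommand('hdfs connect http://namenode:50070/user/demo with user demo')
wrapper.Process_SQLCommand('hdfs makedirs logs')
wrapper.Process_SQLCommand('hdfs upload ./app.log logs/')
print(wrapper.Process_SQLCommand('hdfs list logs'))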
Example #19
import pandas as pd
from hdfs import InsecureClient
#%%
hdfs_client = InsecureClient('http://10.10.250.10:50070', timeout=1)
hdfs_client
#%% 
from datetime import date

# hdfs_path = '/projects/projectfinder/raw/items/' +\
#    date.today().year.__str__() + '/' +\
#    date.today().month.__str__() + '/'

#%% 
hdfs_path = '/projects/projectfinder/raw/items/2019'

#%%
hdfs_client.download(hdfs_path, 'hdfs_data', n_threads=5)


#%% 

hdfs_client_status = hdfs_client.status('/', strict=True)
hdfs_client_status

#%% 
hdfs_file_status = hdfs_client.list(hdfs_path)
hdfs_file_status


#%% [markdown]
# Go to [manual](https://hdfscli.readthedocs.io/en/latest/advanced.html#path-expansion)
# ```bash
Example #21
    def test_file(self):
        client_hdfs = InsecureClient('http://127.0.0.1:50070')
        self.assertTrue(client_hdfs.download("/user/maria_dev/dataset1", ""))
Example #22
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            print(self.client.download(remote_path, local_path=local_path, overwrite=True,
                                       temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            # status() raises HdfsError when the path does not exist
            return self.client.status(self.path(name)) is not None
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
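
The storage class above is driven entirely by Django settings. A minimal sketch of the settings it reads (host, root, and module path are hypothetical):

# settings.py -- hypothetical values for the HDFSStorage backend above
HDFS_STORAGE = {
    'hosts': 'http://namenode:9870',  # first comma-separated entry builds fetch URLs
    'root': '/django/media',          # normalized by fix_slashes()
}
MEDIA_ROOT = '/var/www/media'  # local cache directory used by _open()
MEDIA_URL = '/media/'
DEFAULT_FILE_STORAGE = 'myapp.storage.HDFSStorage'  # hypothetical dotted path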
Example #23
0
class HdfsClient:
    def __init__(self, namenode_host):
        self._client = InsecureClient(f'http://{namenode_host}:9870')

    def download(self, remote_hdfs_path, local_path):
        self._client.download(remote_hdfs_path, local_path, overwrite=True)
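
A one-liner usage of the wrapper above (host and paths are hypothetical):

HdfsClient('namenode.example.com').download('/data/report.csv', './report.csv')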
def config():
    json_available = False
    # Directly pretrained
    if request.json \
        and 'pretrained' in request.json  \
        and 'folder_path' in request.json  \
        and 'model_name' in request.json  \
        and 'hdfs_uri' in request.json:
        # Download files from HDFS
        model_name = request.json['model_name']
        hdfs_uri = request.json['hdfs_uri']
        client_hdfs = InsecureClient(hdfs_uri)
        handler_file = request.json['folder_path'] + "/handler.py"
        client_hdfs.download(handler_file, "./handler.py", overwrite=True)
        logger.info('Download files: OK!!')
        # Make model archives
        # Create empty placeholder files where torch-model-archiver expects them
        file_params = open('./params.pt', 'w')
        file_params.close()
        file_model = open('./model.py', 'w')
        file_model.close()
        response = os.popen(
            """torch-model-archiver --model-name %s --version 1.0 --model-file ./model.py --serialized-file ./params.pt --handler ./handler.py && mv %s.mar /home/model-server/model-store/"""
            % (model_name, model_name)).read().strip()
        # Install library dependencies
        json_available = True
        logger.info('Uploaded model: %s' % model_name)
    # Fine-tuned
    else:
        # Read file paths from Json input
        # Input as path of folder
        if request.json \
            and 'folder_path' in request.json \
            and 'model_name' in request.json \
            and 'hdfs_uri' in request.json:
            handler_file = request.json['folder_path'] + "/handler.py"
            model_file = request.json['folder_path'] + "/model.py"
            params_file = request.json['folder_path'] + "/params.pt"
            model_name = request.json['model_name']
            hdfs_uri = request.json['hdfs_uri']
        # Input as path of files
        elif request.json \
            and 'hdfs_uri' in request.json \
            and 'handler' in request.json  \
            and 'params' in request.json \
            and 'model' in request.json \
            and 'model_name' in request.json:
            handler_file = request.json['handler']
            model_file = request.json['model']
            params_file = request.json['params']
            model_name = request.json['model_name']
            hdfs_uri = request.json['hdfs_uri']
        else:
            # Neither input shape matched; fail fast instead of hitting a
            # NameError on the undefined file path variables below.
            abort(400)

        logger.info('Read json configurations: OK!!')
        # Download files from HDFS
        client_hdfs = InsecureClient(hdfs_uri)
        client_hdfs.download(handler_file, "./handler.py", overwrite=True)
        client_hdfs.download(model_file, "./model.py", overwrite=True)
        client_hdfs.download(params_file, "./params.pt", overwrite=True)
        logger.info('Download files: OK!!')
        # Make model archives
        response = os.popen(
            """torch-model-archiver --model-name %s --version 1.0 --model-file ./model.py --serialized-file ./params.pt --handler ./handler.py && mv %s.mar /home/model-server/model-store/"""
            % (model_name, model_name)).read().strip()
        current_app.hdfs_uri = hdfs_uri
        current_app.configured = True
        json_available = True
        logger.info('Uploaded model: %s' % model_name)

    if not (json_available):
        abort(400)

    return jsonify({'response': response}), 201
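
The route above accepts two request shapes. A hedged sketch of matching client calls (the endpoint URL and all payload values are assumptions, not taken from the source):

import requests

BASE = 'http://localhost:5000/config'  # hypothetical URL for the route above

# Directly pretrained: only handler.py is fetched from HDFS.
requests.post(BASE, json={
    'pretrained': True,
    'folder_path': '/models/resnet50',
    'model_name': 'resnet50',
    'hdfs_uri': 'http://namenode:9870',
})

# Fine-tuned, folder input: handler.py, model.py and params.pt are all
# expected under folder_path.
requests.post(BASE, json={
    'folder_path': '/models/resnet50_ft',
    'model_name': 'resnet50_ft',
    'hdfs_uri': 'http://namenode:9870',
})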
Example #25
0
def handleHdfsDownload(hdfs_path, local_path):
    client = InsecureClient("http://hdfs.neurolearn.com:50070", user="******")
    client.download(hdfs_path, local_path, overwrite=True)
    print('Downloaded Images from HDFS.')
    return local_path
Example #26
0
class HadoopFileSystem(object):
    def __init__(self, *opts):
        self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'], user=current_app.config['WEBHDFS_USER'])
         
#     def make_tree(self, datasourceid, client, path):
#         tree = dict(name=(os.path.basename(path), datasourceid + os.path.sep + path), children=[])
#         try: lst = client.list(path, status=True)
#         except:
#             pass #ignore errors
#         else:
#             for fsitem in lst:
#                 fn = os.path.join(path, fsitem[0])
#                 if fsitem[1]['type'] == "DIRECTORY":
#                     tree['children'].append(make_hdfs_tree(datasourceid, client, fn))
#                 else:
#                     tree['children'].append({'name' : (fsitem[0], datasourceid + os.path.sep + fn), 'children' : []})
#         return tree

    def make_json(self, datasourceid, base, relative_path):
        path = os.path.join(base, relative_path)
        data_json = {'datasource': datasourceid, 'path': relative_path, 'name': os.path.basename(relative_path) }
        status = self.client.status(path, False)

        if status is not None:
            if status['type'] == "DIRECTORY":
                data_json['type'] = DataType.Folder
                data_json['children'] = [self.make_json(datasourceid, base, os.path.join(relative_path, fn)) for fn in self.client.list(path)]
            else:
                data_json['type'] = DataType.File
        #print(json.dumps(data_json))
        return data_json
    
    def makedirs(self, path):
        try: 
            self.client.makedirs(path)
        except:
            return None
        return path
    
    def delete(self, path):
        try: 
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e: print(e)
        
    def addfolder(self, path):
        # find the first "New Folder (i)" name that is not already taken
        i = 0
        while self.client.status(os.path.join(path, "New Folder ({0})".format(i)), False) is not None:
            i += 1
        return self.makedirs(os.path.join(path, "New Folder ({0})".format(i)))
    
    def rename(self, oldpath, newpath):
        try:
            self.client.rename(oldpath, newpath)
        except Exception as e: print(e)
    
    def saveUpload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            # the third positional arg of upload() is n_threads, so pass overwrite by name
            self.client.upload(os.path.dirname(fullpath), localpath, overwrite=True)
        except:
            pass
        
    def download(self, fullpath):
        status = self.client.status(fullpath, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
            return self.client.download(fullpath, localpath, True)
        else:
            return None
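
For reference, a hedged sketch of the dict make_json builds for a folder holding one file (datasource id and names are hypothetical; DataType is the application's own enum):

{
    'datasource': 'ds1',
    'path': 'reports',
    'name': 'reports',
    'type': DataType.Folder,
    'children': [
        {'datasource': 'ds1', 'path': 'reports/2019.csv',
         'name': '2019.csv', 'type': DataType.File},
    ],
}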
Example #27
0
# See what we have in the working directory
print(client.list('/student9_7'))
'''
['cur_readme', 'googlobots.txt', 'py_dir_02', 'readme', 'test', 'test2', 'testdir']
'''


# Check the size of our working directory
print(client.content('/student9_7'))
'''
{'directoryCount': 3, 'fileCount': 5, 'length': 10552, 'quota': -1, 'spaceConsumed': 31637, 'spaceQuota': -1}
'''


# Read the `test` file
with client.read('/student9_7/test') as reader:
    test = reader.read()
print(test)
'''
b'test file for hdfs\n'
'''


# Copy the `test` file from HDFS to the local home directory as `downloaded_file_via_py3`
client.download('/student9_7/test', 'downloaded_file_via_py3', n_threads=5)
'''
'/home/student9_7/downloaded_file_via_py3'
'''