Example #1
def find_hdfs_namenode_address():
    # Probe each configured namenode until one answers a root listing.
    for nd in namenodes:
        hdfs_client = InsecureClient(url=nd, user=user)
        try:
            hdfs_client.list('/')
            return nd
        except Exception:  # narrowed from a bare except; connection errors land here
            continue
    raise Exception("No available name node.")
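
# Usage sketch for the helper above. It assumes module-level `namenodes`
# and `user` globals that the original does not show; the values below are
# placeholders, not from the original source.
from hdfs import InsecureClient

user = 'hdfs'
namenodes = ['http://namenode1:9870', 'http://namenode2:9870']

active_namenode = find_hdfs_namenode_address()
client = InsecureClient(url=active_namenode, user=user)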
Example #2
def create_csv(directory_to):
    # Build an index CSV of the images under the 'yes/' and 'no/'
    # subdirectories and write it back to HDFS.
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    list_yes = hdfs_client.list('/' + directory_to + 'yes/')
    list_images = ['yes/' + name for name in list_yes]
    list_no = hdfs_client.list('/' + directory_to + 'no/')
    list_images += ['no/' + name for name in list_no]

    data = pd.DataFrame(list_images, columns=['Path'])

    with hdfs_client.write('/' + directory_to + 'data.csv',
                           encoding='utf-8') as writer:
        data.to_csv(writer, index_label='index')
Example #3
def download(keyword):
    client = InsecureClient("http://ip_address", user="******")
    root_dir = "/username/dps"
    for folder in client.list(root_dir):
        if keyword not in folder:
            continue
        os.makedirs(os.path.join("data", folder), exist_ok=True)
        for file in client.list(root_dir + "/" + folder):
            target_path = os.path.join("data", folder, file)
            logging.info("Downloading for {}".format(target_path))
            if os.path.exists(target_path):
                logging.warning("{} already exists!".format(target_path))
                continue
            with open(target_path, "wb") as writer, \
                    client.read("{}/{}/{}".format(root_dir, folder, file)) as reader:
                writer.write(reader.read())
Example #4
def renameFiles(ip='172.20.10.2',
                port='9870',
                username='******',
                MainName='result-part',
                SubName='.json',
                dirPath='/tmp/Cathay/'):
    client = InsecureClient("http://" + ip + ":" + port, user=username)
    if dirPath[-1] != '/':
        dirPath += '/'
    fns = client.list(dirPath)
    for fn in fns:
        if 'part-' in fn:
            num = str(int(fn.split('part-')[-1]) + 1)
            client.rename(dirPath + fn, dirPath + MainName + num + SubName)
    return str(fns) + "\n     Change to     \n" + str(client.list(dirPath))
Example #5
def main():

    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    # create directory in HDFS
    client.makedirs('/test')

    #list content
    ll = client.list('/')
    print(ll)

    # create file in HDFS
    data = [{
        "name": "Anne",
        "salary": 10000
    }, {
        "name": "Victor",
        "salary": 9500
    }]
    with client.write('/test/sample_file.json',
                      encoding='utf-8') as json_file_in_hdfs:
        json.dump(data, json_file_in_hdfs)
    # OR
    client.write(os.path.join('/', 'test', 'sample_file2.json'),
                 data=json.dumps(data),
                 encoding='utf-8')

    # download file from HDFS
    client.download('/test/sample_file.json', './file_from_hadoop.json')

    # upload file to HDFS
    client.upload('/test/local_file_in_hadoop.json', './file_from_hadoop.json')
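
# A companion sketch (not in the original): read the JSON that main()
# wrote back from HDFS. URL, user and path mirror the example above.
def read_back():
    client = InsecureClient('http://127.0.0.1:50070/', user='******')
    with client.read('/test/sample_file.json', encoding='utf-8') as reader:
        return json.load(reader)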
Example #6
def uploadHDFS(filename):
    # hdfs_path, upload_path and backup_path are module-level globals
    # defined elsewhere in the original source.
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    # Date folder from the segment after '@',
    # e.g. '...@20200813094718-...' -> '20200813'.
    dt = str(aoi_file.split("@")[1].split("-", 1)[0][:8])
    folder1 = dt
    if folder1 not in fname1:
        client.makedirs(hdfs_path + folder1)
    client.upload(hdfs_path + folder1 + "/" + aoi_file,
                  upload_path + aoi_file,
                  overwrite=True)
    print("uploadHDFS ok")
    shutil.move(upload_path + aoi_file, backup_path + aoi_file)
Example #7
class HdfsClient:
    def __init__(self, namenode_host, datanode_host):
        self._namenode_host = namenode_host
        self._datanode_host = datanode_host
        self._client = InsecureClient(f'http://{self._namenode_host}:9870')
        self._logger = logging.getLogger(__name__)
        self._logger.setLevel(
            logging.getLevelName(os.getenv("LOG_LEVEL", "INFO")))
        ch = logging.StreamHandler()
        ch.setLevel(logging.getLevelName(os.getenv("LOG_LEVEL", "INFO")))
        self._logger.addHandler(ch)

    def list(self, path):
        return self._client.list(path)

    def get_stream(self, hdfs_path):
        # Direct WebHDFS OPEN against the datanode, bypassing the namenode
        # redirect; HdfsClientGetStream is defined elsewhere in the project.
        request_path = (f'http://{self._datanode_host}:9864/webhdfs/v1{hdfs_path}'
                        f'?op=OPEN&namenoderpcaddress={self._namenode_host}:9000&offset=0')
        return HdfsClientGetStream(request_path)

    def upload_to_hdfs(self, local_path, remote_path):
        self._logger.info(f'Upload local path {local_path} to {remote_path}')
        with open(local_path, 'rb') as f:
            r = requests.put(
                f'http://{self._namenode_host}:9870/webhdfs/v1{remote_path}?op=CREATE&overwrite=true',
                data=f)
            self._logger.debug(f'Upload result {r.content}')
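
# upload_to_hdfs above streams the file body to the namenode and relies on
# requests following the 307 redirect to a datanode. A hedged sketch of the
# explicit two-step WebHDFS CREATE handshake (my addition, not part of the
# original class); host and port mirror the example above.
import requests

def upload_two_step(namenode_host, local_path, remote_path):
    url = (f'http://{namenode_host}:9870/webhdfs/v1{remote_path}'
           '?op=CREATE&overwrite=true')
    # Step 1: the namenode answers 307 with the datanode URL in Location.
    r = requests.put(url, allow_redirects=False)
    datanode_url = r.headers['Location']
    # Step 2: PUT the actual bytes to the datanode.
    with open(local_path, 'rb') as f:
        r = requests.put(datanode_url, data=f)
    r.raise_for_status()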
Example #8
class Storage:
    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if self.protocol == 'webhdfs':
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
            # Bind upload/download/list/status/delete to the
            # protocol-specific implementations below.
            for f in 'upload download list status delete'.split():
                setattr(self, f, getattr(self, '%s_%s' % (f, self.protocol)))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path,
                                  hdfs_path=remote_path,
                                  **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path,
                                    hdfs_path=remote_path,
                                    overwrite=True,
                                    **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
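
# Hypothetical usage of the Storage facade above; the URL, user and paths
# are placeholders. to_screen() and mkdir_for() are helpers from the
# original project that are not shown here.
storage = Storage('webHDFS', url='http://namenode:9870', user='hdfs')
storage.upload('./local.csv', '/data/remote.csv')
print(storage.list('/data'))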
Example #9
class HDFSService(object):
    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
        print(len(buf))
        return buf
Example #10
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert isinstance(data_path, str)
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'

        with open(self.data_path) as data_file:
            data = json.load(data_file)
            self.hdfs_client = InsecureClient(
                url='http://' + data['namenode_url'] + ':' + str(data['port']),
                user=data['user'],
                root=data['root_path'])
            self.img_dir = data['img_dir']

        if self.img_dir[-1] != '/':
            self.img_dir += '/'

        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)

        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
            print("Mkdir ...")

        return True

    def DataProcess(self, data, append=False, file_name=None):
        assert isinstance(data, str)
        if file_name is None:
            file_name = self.img_dir + str(self.file_name)
        else:
            assert isinstance(file_name, str)
        print("start writing...")
        start = time.time()
        # overwrite and append are mutually exclusive in hdfs.Client.write,
        # so only overwrite when not appending.
        self.hdfs_client.write(file_name,
                               data,
                               overwrite=not append,
                               replication=1,
                               append=append)
        delta = time.time() - start
        print("writing complete, time delta is " + str(delta))
        return True

    def Upload(self, remote_name, local_path):
        assert os.path.exists(local_path)

        remote_path = self.img_dir + remote_name
        self.hdfs_client.upload(remote_path, local_path, True)
        return True
Example #11
class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # status(strict=False) returns None instead of raising when the
        # path is missing; the original called an undefined exists().
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive)
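
# The wrapper reads its endpoint from a CONST module that is not shown.
# A minimal stand-in plus usage sketch; all values are placeholders.
class CONST:
    HDFS_URL = 'http://namenode:9870'
    HDFS_USER = 'hdfs'

wrapper = HdfsWrapper()
wrapper.connect_hdfs()
wrapper.write_hdfs('/tmp/demo.txt', b'hello', overwrite=True)
print(wrapper.read_hdfs('/tmp/demo.txt'))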
Example #12
def delJPG_Newmodel(basepath):
    now_time = datetime.datetime.now()
    now_date_str = now_time.strftime('%Y%m%d')
    now_date = datetime.datetime.strptime(now_date_str, '%Y%m%d')
    try:
        client = InsecureClient('http://10.41.158.65:50070', user='******')
        folderlist = client.list(basepath)
        newmodel = getnewmodel()
        for fname in folderlist:
            if fname in newmodel:
                folderlist1 = client.list(basepath + '/' + fname)
                print('newmodel:', fname, folderlist1)
                for sub in folderlist1:
                    if is_valid_date(sub) == 'true':
                        folderItem = datetime.datetime.strptime(sub, '%Y%m%d')
                        # Delete date folders older than one year.
                        if folderItem + datetime.timedelta(days=365) <= now_date:
                            paths = basepath + '/' + fname + '/' + sub
                            delHbase(sub, client, paths)
                            deleteKudu(sub, client, paths)
                            try:
                                client.delete(paths, recursive=True)
                                print(paths + ' is deleted')
                            except Exception as e:
                                print(e)
    except Exception as e:
        print(e)
Example #13
def load_file_list_from_hdfs(data_package):
    if log_type in ['time', 'all']:
        st = time.time()
    hdfs_str = data_package.stream_hdfs_file_name
    # Split the address from the path at the final '0/' (assumes the
    # WebHDFS port ends in 0, e.g. 50070 or 9870).
    hdfs_addr = hdfs_str[:hdfs_str.rfind('0/') + 1]
    hdfs_path = hdfs_str[hdfs_str.rfind('0/') + 2:]

    client = InsecureClient(hdfs_addr, user=getpass.getuser())
    return client.list(hdfs_path), hdfs_path
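
# The rfind('0/') heuristic above assumes the WebHDFS port ends in '0'.
# A more defensive split using urllib.parse (my substitution, not the
# original approach):
from urllib.parse import urlparse

def split_hdfs_url(hdfs_str):
    parsed = urlparse(hdfs_str)
    hdfs_addr = '{0}://{1}'.format(parsed.scheme, parsed.netloc)
    hdfs_path = parsed.path.lstrip('/')
    return hdfs_addr, hdfs_path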
Example #14
def read_by_small():
    client = InsecureClient(HDFS_URL, user=HDFS_USERNAME)
    files_list = client.list(HDFS_DIR)
    images = []

    for fn in files_list:
        with client.read(hdfs_path=os.path.join(HDFS_DIR, fn)) as reader:
            img = reader.read()
            images.append(img)

    print(len(images))
Example #15
def load_enedis():

    client = InsecureClient('http://localhost:50070', user='******')
    client.makedirs('data')
    print(client.list('/user/cloudera'))

    # upload a 10-line sample file
    client.upload(
        '/user/cloudera/data',
        '/home/fitec/projet_fil_rouge/source_des_données/data/consommation_elec_regions_2019_l10.json',
        overwrite=True)
Example #16
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        列出根目录
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        写入文件
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        读取文件数据
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)

        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
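
# Usage sketch for HdfsDb, based on the calls quoted in its docstrings.
# check_dir_path is a decorator from the original project (not shown);
# read_file() is a generator, so it is consumed lazily.
hd = HdfsDb()
hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
for line in hd.read_file('test.json', dir_path='/data'):
    print(line)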
Example #17
def uploadHDFS(filename):
    # Variant of Example #6 for filenames whose date segment is separated
    # by '_' rather than '-'; hdfs_path, upload_path and backup_path are
    # module-level globals.
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    dt = str(aoi_file.split("@")[1].split("_")[0][:8])
    folder1 = dt
    if folder1 not in fname1:
        client.makedirs(hdfs_path + folder1)
    client.upload(hdfs_path + folder1 + "/" + aoi_file,
                  upload_path + aoi_file,
                  overwrite=True)
    shutil.move(upload_path + aoi_file, backup_path + aoi_file)
Example #18
def selectKudu():
    countusn = 0
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    print("selectKudu:")
    tablename = "allie.aoi_imageurl"
    hostname1 = 'p8cdhdatap01.wzs.wistron'
    hostname2 = 'p8cdhdatap02.wzs.wistron'
    hostname3 = 'p8cdhdatap03.wzs.wistron'

    port = 21050
    valueslist_str = ' '
    valuelist = ' '

    fileslist = client.list('/P8AOI/MapData/X1726/20210322')
    for i in range(len(fileslist)):
        countusn += 1
        aoi_file = fileslist[i]
        aoi_file = rename(aoi_file)
        sn = getfilesn(aoi_file)
        # print("deleteKudu: " +aoi_file)
        if len(sn) > 10:
            if '@D@' in aoi_file:
                dt = aoi_file[aoi_file.find('@') + 3:aoi_file.find('@') + 11]
            elif '@' in aoi_file:
                dt = aoi_file[aoi_file.find('@') + 1:aoi_file.find('@') + 9]
            valuelist = '\'' + sn + '\''
        if i == 0:
            valueslist_str = valuelist
        else:
            valueslist_str += ',' + valuelist
        if (countusn >= 8000) or (i == (len(fileslist) - 1)):
            sql = "select * from " + tablename + " where usn in (" + valueslist_str + ")"
            os.system('echo ' + sql + ' >> C:/Users/Z18073047/Desktop/sql.txt')
            valueslist_str = "''"
            countusn = 0
            # Try each Impala host in turn; the original re-connected to
            # hostname2 even when hostname1 had already succeeded.
            conn = None
            for hostname in (hostname1, hostname2, hostname3):
                try:
                    conn = connect(host=hostname, port=port)
                    break
                except Exception:
                    time.sleep(2)
            if conn is None:
                print("no Impala host reachable")
                continue
            try:
                cur = conn.cursor()
                cur.execute(sql)
            except Exception as e:
                print(e)
            finally:
                conn.close()
Example #19
    def listdir_hdfs(self):
        self.list_display.setText('')
        ip_address = self.ip_input.toPlainText()
        port_number = self.port_input.toPlainText()
        user_name = self.user_input.toPlainText()
        dir_name = self.dir_input.toPlainText()
        host_address = 'http://' + ip_address + ':' + port_number
        hadoop = InsecureClient(host_address, user_name)
        # status=True makes list() return (name, status) tuples.
        directory = hadoop.list(dir_name, status=True)
        for file in directory:
            display = 'Name:  ' + str(file[0]) + ' | Type:  ' + file[1]['type']
            self.list_display.append(display)
Example #20
def list(path):
    hdfs_client = InsecureClient(Utils.find_hdfs_namenode_address(),
                                 user=Utils.user)

    path += '/'
    relative_path = path
    if path.startswith('hdfs://'):
        pos = path.find('/', 7)
        relative_path = path[pos:]

    files = hdfs_client.list(relative_path)
    return [
        path + filename for filename in files if filename.startswith("part-")
    ]
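
# Hypothetical usage (paths are placeholders). Note that the function
# shadows the built-in list() and returns fully prefixed hdfs:// paths.
parts = list('hdfs://namenode:9000/user/demo/output')
print(parts)  # e.g. ['hdfs://namenode:9000/user/demo/output/part-00000']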
Example #21
    def post(self):
        gen_log.info(self.request.headers)
        gen_log.info(self.request.body)
        job_id = self.get_request_argument('jid', None)
        gen_log.info(job_id)
        if job_id and utils.is_object_id(job_id):
            job = yield self.db.jobs.find_one({"_id": ObjectId(job_id)})
            from hdfs import InsecureClient
            hdfs_client = InsecureClient("http://169.24.2.194:50070",
                                         user='******')
            content = hdfs_client.list("/tmp")
            gen_log.info(content)

            work_dir = os.path.join(UPLOAD_DIR, job.get('uuid', None))
            data_dir = os.path.join(work_dir, "data")
            model_dir = os.path.join(work_dir, "model")

            # Check that the data directory exists
            if not os.path.exists(data_dir):
                self.write_json("Data file not uploaded; please upload the data file", code=1)
                return

            # Check that the model directory exists
            if not os.path.exists(model_dir):
                self.write_json("Model file not uploaded; please upload the model file", code=1)
                return

            # Start uploading the data files
            remote_hdfs_data_dir, local_data_dir = job.get('input',
                                                           "").split("#")
            hdfs_client.upload(remote_hdfs_data_dir, data_dir, overwrite=True)

            # Switch to the model directory
            os.chdir(model_dir)
Example #22
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert isinstance(data_path, str)
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'

        with open(self.data_path) as data_file:
            data = json.load(data_file)
            print("Data: ", data)
            self.hdfs_client = InsecureClient(
                url='http://' + data['namenode_url'] + ':' + str(data['port']),
                user=data['user'],
                root=data['root_path'])
            print("hdfs client: ", self.hdfs_client)
            self.img_dir = data['img_dir']
            print("img dir: ", self.img_dir)

        if self.img_dir[-1] != '/':
            self.img_dir += '/'

        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)

        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)

        return True

    def Upload(self, file_path, threads=2):
        print("FilePath: ", file_path)
        print("img_dir: ", self.img_dir[:-1])
        self.hdfs_client.upload(hdfs_path=self.img_dir[:-1],
                                local_path=file_path,
                                n_threads=threads,
                                overwrite=True)
        return 0
Example #23
def uploadHDFS():
    # hdfs_path, rec_dat and temp_path are module-level globals in the
    # original source.
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    if rec_dat not in fname1:
        client.makedirs(hdfs_path + "/" + rec_dat)
    src = temp_path + '*'
    backup_path = '/bfdata/buffer/total_pre_arch/'
    dsc = hdfs_path + rec_dat + '/'
    print('hdfs dfs -copyFromLocal ', src, dsc)
    os.system(
        '/home/hadoop/wistron-hadoop/hadoop-2.7.1/bin/hdfs dfs -copyFromLocal '
        + src + ' ' + dsc)
    print('mv -f ', temp_path + "*", backup_path)
    os.system('mv -f ' + temp_path + "* " + backup_path)
    end_time = time.time()
    com_dat = datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')
Example #24
    def get(self):

        # Fetch the dataset for evaluation
        df = get_data_cassandra()

        print(df.head())
        X = df['total_estimated_load'].values

        # evaluate parameters (p,d,q)  <=> (AR, I, MA)
        p_values = 7
        d_values = 0
        q_values = 5
        #best_cfg, best_score = evaluate_models(X, p_values, d_values, q_values)
        best_cfg = (p_values, d_values, q_values)

        # Train the best model
        model = ARIMA(X, order=best_cfg)
        model_fit = model.fit()

        # save model
        if not os.path.exists(model_local_path):
            # Create the local export folder if it does not exist
            os.makedirs(model_local_path, exist_ok=False)

        model_fit.save(model_local_path + model_name)

        # Connect to the HDFS client
        client = InsecureClient(url='http://namenode:9870', user='******')

        # Create the storage folder for processed files
        if client.status(model_hdfs_remote_path, strict=False) is None:
            client.makedirs(model_hdfs_remote_path)

        # Copy the model to HDFS
        remote_load_path = client.upload(model_hdfs_remote_path,
                                         model_local_path + model_name,
                                         overwrite=True)
        #print(remote_load_path)

        print(client.list(model_hdfs_remote_path))

        return {'best_cfg': best_cfg, 'status': 'Terminated'}
Example #25
def make_complete_path(path):
    """
    Take a full HDFS path and return the assembled HTML components.
    """
    content = {}
    c = InsecureClient("http://master:50070", session["uid"])
    content["path"] = []
    for k in c.list(path, True):
        if k[1]["type"] == "FILE":
            content["path"].append(
                "<li class='is_file list_content list-group-item'><span class='file name'>"
                + k[0] + "</span><span class='badge'>" + k[1]["owner"] +
                "</span><span class='badge'>" +
                timeStamp(k[1]["modificationTime"]) + "</span></li>")
        else:
            content["path"].append(
                "<li class='is_list list_content list-group-item'><span class='list name'>"
                + k[0] + "</span><span class='badge'>" + k[1]["owner"] +
                "</span><span class='badge'>" +
                timeStamp(k[1]["modificationTime"]) + "</span></li>")
    return content
Example #26
    def launcher(self):
        """ Send remove checkpoints task """

        # Connect
        client = InsecureClient('http://{ip}:{port}'.format(
            ip=self.namenode_ip, port=self.namenode_port),
                                user=self.file_user)

        # Get current timestamp
        timenow = calendar.timegm(datetime.datetime.now().timetuple())
        unix_timestamp = int(timenow * 1000)
        onehour = 3600000
        todelete = int(unix_timestamp - onehour)

        # Return file name list
        message = None
        for directory in self.directories:
            fnames = client.list(directory, status=True)

            # Fetch list and check each entry's modificationTime
            for fname in fnames:
                ctime = fname[1]['modificationTime']
                if ctime <= todelete:
                    dirtodelete = fname[1]['pathSuffix']
                    client.delete('{directory}/{dirtodelete}'.format(
                        directory=directory, dirtodelete=dirtodelete),
                                  recursive=True)
                    l.info(
                        'Removing {dir} ...Removed!'.format(dir=dirtodelete))
                    # The original assigned the result of list.append(),
                    # which is always None; keep the list itself instead.
                    self.deleteddirs.append(dirtodelete)
                    message = self.deleteddirs
                else:
                    l.info(
                        'Nothing to remove into {directory}. Bye bye!'.format(
                            directory=directory))

        if message:
            stdout = message
        else:
            stdout = 'No directories were deleted.'

        return {'Deleted directories': stdout}
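
# Note on the cutoff arithmetic above: calendar.timegm() interprets the
# naive now() tuple as UTC, so `todelete` is only correct on UTC hosts.
# A timezone-safe equivalent (my substitution, not the original):
import time

todelete = int((time.time() - 3600) * 1000)  # epoch ms, one hour ago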
Example #27
def listFiles(storeConfig, name):
    fileList = list()
    local = storeConfig.getboolean('localStore')
    if local:
        baseDir = storeConfig['baseDir']
        dirName = baseDir + name
        for dirEntry in os.scandir(dirName):
            # Skip hidden files; collect plain file names.
            if dirEntry.is_file() and not dirEntry.name.startswith('.'):
                fileList.append(dirEntry.name)
    else:  # list files on HDFS
        hdfsBaseDir = storeConfig['hdfsBaseDir']
        dirName = hdfsBaseDir + name
        hdfsUrl = storeConfig['hdfsUrl']
        hdfsClient = InsecureClient(hdfsUrl, user='******')
        try:
            fileList = hdfsClient.list(dirName)
        except Exception:
            print(
                f"Got HDFS exception listing directory {dirName}, returning empty list"
            )
    return fileList
Example #28
class interHDFS:
    def __init__(self, url, user=None, **kwargs):
        self.url = url
        self.user = user
        # The original wrote `self.k = v`, which sets a literal attribute
        # named 'k'; setattr stores each keyword under its own name.
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.connect = InsecureClient(self.url, self.user)
        try:
            self.connect.status('/')
        except Exception as e:
            print(f"[ERROR]: {e}")
            raise ConnectionError("connection failed!") from e

    @property
    def apiVersion(self):
        return "v1"

    def listDir(self, dirname: str = '/'):
        return self.connect.list(dirname)

    def getFiles(self, dirname: str, depth: int = 0) -> list:
        l = []
        if not dirname:
            print("dirname is null")
        else:
            for file in self.connect.walk(dirname, depth=depth):
                if file[-1]:
                    for f in file[-1]:
                        l.append(file[0] + '/' + f)
            return l

    def downloadToCsv(self, filename: str) -> None:
        '''only split for the '€€' sign, and generate same filename in current directory'''
        with self.connect.read(filename, encoding='utf-8') as reader:
            with open(csvdir + filename.split('/')[-1].split('.')[0] + '.csv',
                      'a+') as cf:
                for line in reader.readlines():
                    newline = line.replace('€€', ',')
                    cf.write(newline)
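
# Hypothetical usage of interHDFS; URL and user are placeholders, and
# csvdir is a module-level global in the original source.
ih = interHDFS('http://namenode:9870', user='hdfs')
print(ih.listDir('/'))
for path in ih.getFiles('/data', depth=1):
    print(path)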
Example #29
class HDFS(BaseRepository):
    def __init__(self, host: str, port, user: str):
        super().__init__()
        self.host = host
        self.port = port
        self.user = user
        self.producer = None

    def connect(self):
        self.conn = InsecureClient(f"http://{self.host}:{self.port}",
                                   user=self.user)
        if os.environ.get("KAFKA_BOOTSTRAP", None):
            self.producer = KafkaProducer(bootstrap_servers=os.environ.get(
                "KAFAKA_BOOTSTRAP", "localhost:1234"))
        else:
            self.producer = None

    def disconnect(self):
        self.save_snapshot()
        if self.producer:
            self.producer.close()

    def insert_rows(self, rows: list[tuple[datetime.datetime, str, str, str, str, str]]):
        self.add_buff(rows)
        self.flush()

    def _last_datetime(self, category, date):
        if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0:
            return config.min_date
        # Download the ORC file to a temp file, scan it, then clean up.
        with tempfile.NamedTemporaryFile("wb", delete=False) as tf:
            tfname = tf.name
            with self.conn.read(f"/krwordcloud/add-article/{date}",
                                chunk_size=8096) as hf:
                for chunk in hf:
                    tf.write(chunk)
        try:
            with open(tfname, 'rb') as tf:
                reader = pyorc.Reader(tf)
                maximum = datetime.datetime \
                    .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z")
                for row in reader:
                    if row[0] > maximum and row[1] == category:
                        maximum = row[0]
                if maximum < config.min_date:
                    return config.min_date
                elif maximum > datetime.datetime.now().replace(tzinfo=KST):
                    return datetime.datetime.now().replace(tzinfo=KST)
                else:
                    return maximum
        finally:
            # The original unlinked after return, so it never ran.
            os.unlink(tfname)

    def make_entries(self):
        entries = dict()
        hdfs_entries = dict()
        lookup_hdfs = []

        self.load_snapshot()

        for category in config.categories:
            category_rows = list(
                filter(lambda row: row[1] == category, self.buff))
            if len(category_rows) > 0:
                last = max(category_rows, key=lambda row: row[0])
                entries[category] = last[0]
            else:
                lookup_hdfs.append(category)

        try:
            dates = self.conn.list("/krwordcloud/add-article/")
            if len(dates) > 0:
                for category in lookup_hdfs:
                    found = False
                    for last in reversed(dates):
                        try:
                            entries[category] = self._last_datetime(
                                category, last)
                            found = True
                            break
                        except Exception as e:
                            print(e)
                            continue
                    if found is False:
                        entries[category] = config.min_date
            else:
                hdfs_entries = dict.fromkeys(lookup_hdfs, config.min_date)
        except HdfsError:
            entries[category] = config.min_date
        except Exception as e:
            print(e)
        return {
            k: v
            for k, v in sorted({
                **entries,
                **hdfs_entries
            }.items(),
                               key=lambda item: item[1])
        }

    def save_snapshot(self):
        print('save_snapshot')
        with self.conn.write("/krwordcloud/snapshot.json",
                             overwrite=True,
                             encoding="utf-8") as f:
            data = list(
                map(lambda x: (x[0].isoformat(), x[1], x[2], x[3], x[4], x[5]),
                    self.buff))
            json.dump(data, f, ensure_ascii=False)

    def load_snapshot(self):
        print('load_snapshot')
        try:
            with self.conn.read("/krwordcloud/snapshot.json",
                                encoding="utf-8") as f:
                self.buff = list(
                    map(
                        lambda x:
                        (parser.parse(x[0]), x[1], x[2], x[3], x[4], x[5]),
                        json.load(f)))
        except Exception:
            self.buff = []

    def flush(self):
        dates = sorted(list(set(map(lambda row: row[0].date(), self.buff))))
        if len(dates) > 1:
            for d in dates[:-1]:
                data = list(filter(lambda row: row[0].date() == d, self.buff))
                if self.producer:
                    self._kafka_flush(d, data)
                else:
                    self._hdfs_flush(d, data)
            self.buff = list(
                filter(lambda row: row[0].date() == dates[-1], self.buff))
            self.save_snapshot()

    def _kafka_flush(self, date, data):
        self.producer.send(f"add-article-{date}", data)

    def _hdfs_flush(self, date, data):
        with self.conn.write(f"/krwordcloud/add-article/{date}.orc",
                             overwrite=True) as hf:
            tfname = ''
            with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as tf:
                tfname = tf.name
                with pyorc.Writer(
                        tf,
                        schema="struct<field0:timestamp,field1:string," +
                        "field2:string,field3:string>",
                ) as of:
                    of.writerows(data)
            with open(tfname, 'rb') as tf:
                for line in tf:
                    hf.write(line)
            os.unlink(tfname)
Example #30
# encoding : utf8
from kafka import KafkaConsumer
from hdfs import InsecureClient
import bson
import time

# Init HDFS
client = InsecureClient('http://X:50070', user='******')
hdfs_file  = 'tweets.json'
# Create file if not exist
hdfs_files_list = client.list('')
if hdfs_file not in hdfs_files_list:
    with client.write(hdfs_file) as writer:
        writer.write('')

# Init kafka
consumer = KafkaConsumer('X', group_id='X_GRP',
                         bootstrap_servers='X:9092')

with client.write(hdfs_file, append=True) as writer:
    # New kafka message
    for msg in consumer:
        print(time.strftime("%Y-%m-%d %H:%M:%S") + " [DEBUG] new tweet (consumer)")
        tweet = msg.value
        # Write message in HDFS
        writer.write(tweet)
Example #31
def start_service():

    # Download path of the data file
    file_path = "/home/formation/Downloads/hrl_load_estimated.csv"

    # Connect to the HDFS client (URL redacted in the source)
    client = InsecureClient(url='http://*****:*****')

    # ... (Selenium driver setup, the year loop, and the opening of a try
    # block are elided in the source; the code below runs inside them) ...

            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@default-value='defaultStartDate']")))

            #print(driver.page_source)

            # Enter the start and end dates
            elem = driver.find_element_by_xpath(
                "//input[@default-value='defaultStartDate']")
            elem.clear()
            elem.send_keys("01/01/" + str(year))
            elem = driver.find_element_by_xpath(
                "//input[@default-value='defaultEndDate']")
            elem.clear()
            elem.send_keys("12/31/" + str(year))

            # Wait for the page to reload
            time.sleep(5)

            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//button[text()='Submit']")))

            # Submit the form to reload the data for the requested period
            elem = driver.find_element_by_xpath("//button[text()='Submit']")
            elem.click()

            # Wait for the export button to appear
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "dm-download")))
            elem = driver.find_element_by_class_name("dm-download")
            elem.click()

            # Wait for the file to finish downloading
            while not os.path.exists(file_path):
                time.sleep(1)

            if os.path.isfile(file_path):
                print("File downloaded for year {}".format(year))

                # Rename the file
                new_file_name = file_path.replace(".csv",
                                                  "_" + str(year) + ".csv")
                os.rename(file_path, new_file_name)

                # Upload the local CSV file to HDFS
                try:
                    remote_load_path = client.upload('/user/root/data/pjm',
                                                     new_file_name,
                                                     overwrite=True)
                    # print(remote_load_path)
                except Exception as e:
                    print("error", e)

                print(client.list('/user/root/data/pjm'))

            else:
                raise ValueError("%s isn't a file!" % file_path)

        finally:
            #driver.quit()
            print("finished processing the file")

        #assert "No results found." not in driver.page_source
        driver.close()

        time.sleep(10)
Example #32
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        # HDFS paths always use forward slashes, regardless of the local
        # OS; the original used os.path.sep, which breaks on Windows.
        sep = '/'
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.mkdir(local_dir)
            print(self.client.download(remote_path, local_path=local_path, overwrite=True,
                                       temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            return True if self.client.status(self.path(name)) else False
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
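
# A sketch of the Django settings HDFSStorage expects; all values are
# placeholders, not from the original project.
HDFS_STORAGE = {
    'hosts': 'http://namenode:9870',
    'root': '/media',
}
MEDIA_ROOT = '/var/www/media'
MEDIA_URL = '/media/'
DEFAULT_FILE_STORAGE = 'myapp.storage.HDFSStorage'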