def find_hdfs_namenode_address():
    for nd in namenodes:
        hdfs_client = InsecureClient(url=nd, user=user)
        try:
            hdfs_client.list('/')
            return nd
        except Exception:
            continue
    raise Exception("No available name node.")
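# Minimal usage sketch (not from the source): find_hdfs_namenode_address()
# relies on module-level `namenodes` and `user`, which are illustrative
# placeholders here.
from hdfs import InsecureClient

namenodes = ['http://namenode1:9870', 'http://namenode2:9870']  # hypothetical hosts
user = 'hdfs'                                                   # hypothetical user

active_namenode = find_hdfs_namenode_address()
print(InsecureClient(url=active_namenode, user=user).list('/'))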
def create_csv(directory_to):
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    list_yes = hdfs_client.list('/' + directory_to + 'yes')
    list_images = ['yes/' + name for name in list_yes]
    list_no = hdfs_client.list('/' + directory_to + 'no/')
    list_images += ['no/' + name for name in list_no]
    data = pd.DataFrame(list_images, columns=['Path'])
    with hdfs_client.write('/' + directory_to + 'data.csv', encoding='utf-8') as writer:
        data.to_csv(writer, index_label='index')
def download(keyword):
    client = InsecureClient("http://ip_address", user="******")
    root_dir = "/username/dps"
    for folder in client.list(root_dir):
        if keyword not in folder:
            continue
        os.makedirs(os.path.join("data", folder), exist_ok=True)
        for file in client.list(root_dir + "/" + folder):
            target_path = os.path.join("data", folder, file)
            logging.info("Downloading for {}".format(target_path))
            if os.path.exists(target_path):
                logging.warning("{} already exists!".format(target_path))
                continue
            with open(target_path, "wb") as writer, \
                    client.read("{}/{}/{}".format(root_dir, folder, file)) as reader:
                writer.write(reader.read())
def renameFiles(ip='172.20.10.2', port='9870', username='******',
                MainName='result-part', SubName='.json', dirPath='/tmp/Cathay/'):
    client = InsecureClient("http://" + ip + ":" + port, user=username)
    if dirPath[-1] != '/':
        dirPath += '/'
    fns = client.list(dirPath)
    for fn in fns:
        if 'part-' in fn:
            num = str(int(fn.split('part-')[-1]) + 1)
            client.rename(dirPath + fn, dirPath + MainName + num + SubName)
    return str(fns) + "\n Change to \n" + str(client.list(dirPath))
def main():
    client = InsecureClient('http://127.0.0.1:50070/', user='******')
    # create directory in HDFS
    client.makedirs('/test')
    # list content
    ll = client.list('/')
    print(ll)
    # create file in HDFS
    data = [{"name": "Anne", "salary": 10000}, {"name": "Victor", "salary": 9500}]
    with client.write('/test/sample_file.json', encoding='utf-8') as json_file_in_hdfs:
        json.dump(data, json_file_in_hdfs)
    # OR
    client.write(os.path.join('/', 'test', 'sample_file2.json'),
                 data=json.dumps(data), encoding='utf-8')
    # download file from HDFS
    client.download('/test/sample_file.json', './file_from_hadoop.json')
    # upload file to HDFS
    client.upload('/test/local_file_in_hadoop.json', './file_from_hadoop.json')
def uploadHDFS(filename):
    # if ' ' in filename:
    #     aoi_file = rename(aoi_file)
    # else:
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    # if aoi_file.split("@").count('-') >= 2:
    dt = str(aoi_file.split("@")[1].split("-", 1)[0][:8])
    # else:
    #     dt = str(aoi_file.split("_")[-1].split("-")[-2].split("@")[1][:8])
    #     dt = str(aoi_file.split("@")[1].split("-",1)[0][:8])
    folder1 = dt
    if folder1 in fname1:
        client.upload(hdfs_path + folder1 + "/" + aoi_file, upload_path + aoi_file, overwrite=True)
        print("uploadHDFS ok")
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
    else:
        client.makedirs(hdfs_path + folder1)
        client.upload(hdfs_path + folder1 + "/" + aoi_file, upload_path + aoi_file, overwrite=True)
        print("uploadHDFS ok")
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
class HdfsClient:
    def __init__(self, namenode_host, datanode_host):
        self._namenode_host = namenode_host
        self._datanode_host = datanode_host
        self._client = InsecureClient(f'http://{self._namenode_host}:9870')
        self._logger = logging.getLogger(__name__)
        self._logger.setLevel(logging.getLevelName(os.getenv("LOG_LEVEL", "INFO")))
        ch = logging.StreamHandler()
        ch.setLevel(logging.getLevelName(os.getenv("LOG_LEVEL", "INFO")))
        self._logger.addHandler(ch)

    def list(self, path):
        return self._client.list(path)

    def get_stream(self, hdfs_path):
        request_path = (f'http://{self._datanode_host}:9864/webhdfs/v1{hdfs_path}'
                        f'?op=OPEN&namenoderpcaddress={self._namenode_host}:9000&offset=0')
        return HdfsClientGetStream(request_path)

    def upload_to_hdfs(self, local_path, remote_path):
        self._logger.info(f'Upload local path {local_path} to {remote_path}')
        with open(local_path, 'rb') as f:
            r = requests.put(
                f'http://{self._namenode_host}:9870/webhdfs/v1{remote_path}?op=CREATE&overwrite=true',
                data=f)
        self._logger.debug(f'Upload result {r.content}')
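# Hedged usage sketch for the class above; the hostnames and file names are
# placeholders, and get_stream() additionally assumes the HdfsClientGetStream
# helper referenced in the class is defined elsewhere in the project.
client = HdfsClient(namenode_host='namenode.example.com',
                    datanode_host='datanode.example.com')
print(client.list('/tmp'))                          # list a directory via the hdfs library
client.upload_to_hdfs('report.csv', '/tmp/report.csv')  # raw WebHDFS PUT upload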
class Storage:
    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if protocol.lower() == 'webHDFS'.lower():
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
        for f in 'upload download list status delete'.split():
            setattr(self, f, getattr(self, '%s_%s' % (f, protocol.lower())))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path, hdfs_path=remote_path, **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path, hdfs_path=remote_path,
                                    overwrite=True, **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
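# Minimal sketch of how Storage might be called; the URL, user, and paths are
# assumptions, the positional/keyword arguments are forwarded straight to
# hdfs.InsecureClient, and to_screen()/mkdir_for() are helpers assumed to exist
# in the surrounding project.
storage = Storage('webHDFS', 'http://namenode.example.com:9870', user='hadoop')
print(storage.list('/tmp'))                      # bound to list_webhdfs in __init__
storage.upload('./model.bin', '/tmp/model.bin')  # bound to upload_webhdfs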
class HDFSService(object):
    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
            print(len(buf))
            return buf
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert type(data_path) == str
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'
        with open(self.data_path) as data_file:
            data = json.load(data_file)
        self.hdfs_client = InsecureClient(
            url='http://' + data['namenode_url'] + ':' + str(data['port']),
            user=data['user'],
            root=data['root_path'])
        self.img_dir = data['img_dir']
        if self.img_dir[-1] != '/':
            self.img_dir += '/'
        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)
        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
            print("Mkdir ...")
        return True

    def DataProcess(self, data, append=False, file_name=None):
        assert type(data) == str
        if file_name is None:
            file_name = self.img_dir + str(self.file_name)
        else:
            assert (type(file_name)) == str
        print("start writing...")
        start = time.time()
        self.hdfs_client.write(file_name, data, overwrite=True, replication=1, append=append)
        delta = time.time() - start
        print("writing complete, time delta is " + str(delta))
        return True

    def Upload(self, remote_name, local_path):
        assert os.path.exists(local_path)
        remote_path = self.img_dir + remote_name
        self.hdfs_client.upload(remote_path, local_path, True)
        return True
class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # status(strict=False) returns None when the path does not exist
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive)
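# Illustrative only: this assumes the CONST.HDFS_URL / CONST.HDFS_USER settings
# and the module-level `log` used by the wrapper are provided by the
# surrounding project; the paths and payload are placeholders.
wrapper = HdfsWrapper()
wrapper.connect_hdfs()
wrapper.mkdir_hdfs('/tmp/demo')
wrapper.write_hdfs('/tmp/demo/hello.txt', b'hello', overwrite=True)
print(wrapper.read_hdfs('/tmp/demo/hello.txt'))  # returns the raw bytes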
def delJPG_Newmodel(basepath):
    now_time = datetime.datetime.now()
    now_date_str = now_time.strftime('%Y%m%d')
    now_date = datetime.datetime.strptime(now_date_str, '%Y%m%d')
    try:
        # client = Client('http://10.41.158.72:50070')
        # client = InsecureClient('http://10.41.158.106:50075', user='******')
        client = InsecureClient('http://10.41.158.65:50070', user='******')
        # path = "/P8AOI"
        # path1 = "C:/Users/z18073048/Desktop/bigdata/X1778-ANSI-BOT_20200813_TB1-F11-TRI-05@20200813094718-FPW03354EX3P49WBS.JPG"
        # client.upload(path, path1, cleanup=True)
        folderlist = client.list(basepath)
        newmodel = getnewmodel()
        for i in range(len(folderlist)):
            if isinstance(folderlist[i], unicode):
                # if isinstance(folderlist[i], list):
                folderlist[i] = folderlist[i].decode('string_escape')
            fname = folderlist[i]
            # print folderlist[i]
            # if (fname == 'X1777' or fname == 'X1778' or fname == 'Errormodel'):
            if (fname in newmodel):
                folderlist1 = client.list(basepath + '/' + fname)
                print 'newmodel:', folderlist[i], folderlist1
                # use a separate index so the outer loop variable is not shadowed
                for j in range(len(folderlist1)):
                    if isinstance(folderlist1[j], unicode):
                        folderlist1[j] = folderlist1[j].decode('string_escape')
                    date_flag = is_valid_date(folderlist1[j])
                    # print date_flag
                    if date_flag == 'true':
                        folderItem = datetime.datetime.strptime(folderlist1[j], '%Y%m%d')
                        if folderItem + datetime.timedelta(days=365) <= now_date:
                            paths = basepath + fname + '/' + folderlist1[j]
                            delHbase(folderlist1[j], client, paths)
                            deleteKudu(folderlist1[j], client, paths)
                            try:
                                client.delete(paths, recursive=True)
                                print paths + ' is delete'
                            except Exception as e:
                                print e
    except Exception as e:
        print e
def load_file_list_from_hdfs(data_package):
    if log_type in ['time', 'all']:
        st = time.time()
    hdfs_str = data_package.stream_hdfs_file_name
    hdfs_addr = hdfs_str[:hdfs_str.rfind('0/') + 1]
    hdfs_path = hdfs_str[hdfs_str.rfind('0/') + 2:]
    client = InsecureClient(hdfs_addr, user=getpass.getuser())
    return client.list(hdfs_path), hdfs_path
def read_by_small():
    client = InsecureClient(HDFS_URL, user=HDFS_USERNAME)
    files_list = client.list(HDFS_DIR)
    images = []
    for fn in files_list:
        with client.read(hdfs_path=os.path.join(HDFS_DIR, fn)) as reader:
            img = reader.read()
            images.append(img)
            print(len(img))
def load_enedis():
    client = InsecureClient('http://localhost:50070', user='******')
    client.makedirs('data')
    print(client.list('/user/cloudera'))
    # load 10 lines
    client.upload(
        '/user/cloudera/data',
        '/home/fitec/projet_fil_rouge/source_des_données/data/consommation_elec_regions_2019_l10.json',
        overwrite=True)
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        List a directory (defaults to the root directory).
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        Write a file, e.g.
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        Read a file's contents, e.g.
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
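# Usage sketch taken from the docstrings above; it assumes the check_dir_path
# decorator referenced by the class is importable from the same project.
hd = HdfsDb()
hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
for line in hd.read_file('test.json', dir_path='/data'):
    print(line)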
def uploadHDFS(filename):
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    dt = str(aoi_file.split("@")[1].split("_")[0][:8])
    folder1 = dt
    if folder1 in fname1:
        client.upload(hdfs_path + folder1 + "/" + aoi_file, upload_path + aoi_file, overwrite=True)
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
    else:
        client.makedirs(hdfs_path + folder1)
        client.upload(hdfs_path + folder1 + "/" + aoi_file, upload_path + aoi_file, overwrite=True)
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
def selectKudu():
    countusn = 0
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    print("selectKudu:")
    tablename = "allie.aoi_imageurl"
    hostname1 = 'p8cdhdatap01.wzs.wistron'
    hostname2 = 'p8cdhdatap02.wzs.wistron'
    hostname3 = 'p8cdhdatap03.wzs.wistron'
    port = 21050
    valueslist_str = ' '
    valuelist = ' '
    fileslist = client.list('/P8AOI/MapData/X1726/20210322')
    for i in range(len(fileslist)):
        countusn += 1
        aoi_file = fileslist[i]
        aoi_file = rename(aoi_file)
        sn = getfilesn(aoi_file)
        # print("deleteKudu: " + aoi_file)
        if len(sn) > 10:
            if '@D@' in aoi_file:
                dt = aoi_file[aoi_file.find('@') + 3:aoi_file.find('@') + 11]
            elif '@' in aoi_file:
                dt = aoi_file[aoi_file.find('@') + 1:aoi_file.find('@') + 9]
            valuelist = '\'' + sn + '\''
            if i == 0:
                valueslist_str = valuelist
            else:
                valueslist_str += ',' + valuelist
        if (countusn >= 8000) or (i == (len(fileslist) - 1)):
            sql = "select * from " + tablename + " where usn in (" + valueslist_str + ")"
            os.system('echo ' + sql + ' >> C:/Users/Z18073047/Desktop/sql.txt')
            valueslist_str = "''"
            countusn = 0
            try:
                conn = connect(host=hostname1, port=port)
            except Exception:
                time.sleep(2)
                try:
                    conn = connect(host=hostname2, port=port)
                except Exception:
                    time.sleep(2)
                    conn = connect(host=hostname3, port=port)
            finally:
                try:
                    cur = conn.cursor()
                    cur.execute(sql)
                except Exception as e:
                    print(e)
                conn.close()
def listdir_hdfs(self):
    index = 0
    self.list_display.setText('')
    ip_address = self.ip_input.toPlainText()
    port_number = self.port_input.toPlainText()
    user_name = self.user_input.toPlainText()
    dir_name = self.dir_input.toPlainText()
    host_address = 'http://' + ip_address + ':' + port_number
    hadoop = InsecureClient(host_address, user_name)
    directory = hadoop.list(dir_name, status=True)
    # self.list_display.append(directory[0][0] + ':' + directory[0][1]['type'])
    for file in directory:
        display = 'Name: ' + str(file[0]) + ' | Type: ' + file[1]['type']
        self.list_display.append(display)
def list(path):
    hdfs_client = InsecureClient(Utils.find_hdfs_namenode_address(), user=Utils.user)
    path += '/'
    relative_path = path
    if path.startswith('hdfs://'):
        pos = path.find('/', 7)
        relative_path = path[pos:]
    files = hdfs_client.list(relative_path)
    return [
        path + filename for filename in files if filename.startswith("part-")
    ]
def post(self):
    gen_log.info(self.request.headers)
    gen_log.info(self.request.body)
    # data = self.get_all_request_arguments()
    job_id = self.get_request_argument('jid', None)
    gen_log.info(job_id)
    if job_id and utils.is_object_id(job_id):
        job = yield self.db.jobs.find_one({"_id": ObjectId(job_id)})
        from hdfs import InsecureClient
        hdfs_client = InsecureClient("http://169.24.2.194:50070", user='******')
        content = hdfs_client.list("/tmp")
        gen_log.info(content)
        content = hdfs_client.list("/tmp")
        gen_log.info(content)
        work_dir = os.path.join(UPLOAD_DIR, job.get('uuid', None))
        data_dir = os.path.join(work_dir, "data")
        model_dir = os.path.join(work_dir, "model")
        # check whether the data file directory exists
        if not os.path.exists(data_dir):
            self.write_json("数据文件未上传,请上传数据文件", code=1)
            return
        # check whether the model file directory exists
        if not os.path.exists(model_dir):
            self.write_json("模型文件未上传,请上传模型文件", code=1)
            return
        # start uploading the data files
        remote_hdfs_data_dir, local_data_dir = job.get('input', "").split("#")
        hdfs_client.upload(remote_hdfs_data_dir, data_dir, overwrite=True)
        # change into the model directory
        os.chdir(model_dir)
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert type(data_path) == str
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'
        with open(self.data_path) as data_file:
            data = json.load(data_file)
        print("Data: ", data)
        self.hdfs_client = InsecureClient(
            url='http://' + data['namenode_url'] + ':' + str(data['port']),
            user=data['user'],
            root=data['root_path'])
        print("hdfs client: ", self.hdfs_client)
        self.img_dir = data['img_dir']
        print("img dir: ", self.img_dir)
        if self.img_dir[-1] != '/':
            self.img_dir += '/'
        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)
        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
        return True

    def Upload(self, file_path, threads=2):
        print("FilePath: ", file_path)
        print("img_dir: ", self.img_dir[:-1])
        self.hdfs_client.upload(hdfs_path=self.img_dir[:-1],
                                local_path=file_path,
                                n_threads=threads,
                                overwrite=True)
        return 0
def uploadHDFS():
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    if rec_dat not in fname1:
        client.makedirs(hdfs_path + "/" + rec_dat)
    src = temp_path + '*'
    backup_path = '/bfdata/buffer/total_pre_arch/'
    dsc = hdfs_path + rec_dat + '/'
    print('hdfs dfs -copyFromLocal ', src, dsc)
    os.system(
        '/home/hadoop/wistron-hadoop/hadoop-2.7.1/bin/hdfs dfs -copyFromLocal '
        + src + ' ' + dsc)
    print('mv -f ', temp_path + "*", backup_path)
    os.system('mv -f ' + temp_path + "* " + backup_path)
    # os.system('/usr/bin/find ' + temp_path + ' -name *.JPG -exec mv {} ' + backup_path + ' \;')
    end_time = time.time()
    com_dat = datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')
def get(self):
    # Retrieve the dataset for evaluation
    df = get_data_cassandra()
    print(df.head())
    X = df['total_estimated_load'].values
    # evaluate parameters (p,d,q) <=> (AR, I, MA)
    p_values = 7
    d_values = 0
    q_values = 5
    # best_cfg, best_score = evaluate_models(X, p_values, d_values, q_values)
    best_cfg = (p_values, d_values, q_values)
    # Train the best model
    model = ARIMA(X, order=best_cfg)
    model_fit = model.fit()
    # save model
    if not os.path.exists(model_local_path):
        # Create the local export directory if it does not exist yet
        os.makedirs(model_local_path, exist_ok=False)
    model_fit.save(model_local_path + model_name)
    # Connect to the HDFS client
    client = InsecureClient(url='http://namenode:9870', user='******')
    # Create the directory that stores the processed files
    if client.status(model_hdfs_remote_path, strict=False) is None:
        client.makedirs(model_hdfs_remote_path)
    # Copy the model to HDFS
    remote_load_path = client.upload(model_hdfs_remote_path,
                                     model_local_path + model_name,
                                     overwrite=True)
    # print(remote_load_path)
    print(client.list(model_hdfs_remote_path))
    return {'best_cfg': best_cfg, 'status': 'Terminated'}
def make_complete_path(path):
    """
    Given a full HDFS path, return the assembled HTML components.
    """
    content = {}
    c = InsecureClient("http://master:50070", session["uid"])
    content["path"] = []
    for k in c.list(path, True):
        if k[1]["type"] == "FILE":
            content["path"].append(
                "<li class='is_file list_content list-group-item'><span class='file name'>"
                + k[0] + "</span><span class='badge'>" + k[1]["owner"] +
                "</span><span class='badge'>" +
                timeStamp(k[1]["modificationTime"]) + "</span></li>")
        else:
            content["path"].append(
                "<li class='is_list list_content list-group-item'><span class='list name'>"
                + k[0] + "</span><span class='badge'>" + k[1]["owner"] +
                "</span><span class='badge'>" +
                timeStamp(k[1]["modificationTime"]) + "</span></li>")
    return content
def launcher(self):
    """
    Send remove checkpoints task
    """
    # Connect
    client = InsecureClient('http://{ip}:{port}'.format(
        ip=self.namenode_ip, port=self.namenode_port), user=self.file_user)
    # Get current timestamp
    timenow = calendar.timegm(datetime.datetime.now().timetuple())
    unix_timestamp = int(timenow * 1000)
    onehour = 3600000
    todelete = int(unix_timestamp - onehour)
    # Return file name list
    for directory in self.directories:
        fnames = client.list(directory, status=True)
        # Fetch list and check each entry's modificationTime
        for fname in fnames:
            ctime = fname[1]['modificationTime']
            if ctime <= todelete:
                dirtodelete = fname[1]['pathSuffix']
                client.delete('{directory}/{dirtodelete}'.format(
                    directory=directory, dirtodelete=dirtodelete),
                    recursive=True)
                l.info('Removing {dir} ...Removed!'.format(dir=dirtodelete))
                self.deleteddirs.append(dirtodelete)
            else:
                l.info('Nothing to remove into {directory}. Bye bye!'.format(
                    directory=directory))
    # list.append() returns None, so report deletions via self.deleteddirs
    if self.deleteddirs:
        stdout = self.deleteddirs
    else:
        stdout = 'No directories were deleted.'
    return {'Deleted directories': stdout}
def listFiles(storeConfig, name):
    fileList = list()
    local = storeConfig.getboolean('localStore')
    if (local):
        baseDir = storeConfig['baseDir']
        dirName = baseDir + name
        dirEntries = os.scandir(dirName)
        for dirEntry in dirEntries:
            # os.DirEntry uses is_file(); skip hidden files by name
            if (dirEntry.is_file() and not dirEntry.name.startswith('.')):
                fileList.append(dirEntry.name)
    else:
        # list files HDFS
        hdfsBaseDir = storeConfig['hdfsBaseDir']
        dirName = hdfsBaseDir + name
        hdfsUrl = storeConfig['hdfsUrl']
        hdfsClient = InsecureClient(hdfsUrl, user='******')
        try:
            fileList = hdfsClient.list(dirName)
        except Exception:
            print(
                f"Got HDFS exception listing directory {dirName}, returning empty list"
            )
    return fileList
class interHDFS:
    def __init__(self, url, user=None, **kwargs):
        self.url = url
        self.user = user
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.connect = InsecureClient(self.url, self.user)
        try:
            self.connect.status('/')
        except Exception as e:
            print(f"[ERROR]: {e}")
            raise ConnectionError("connect failed!")

    @property
    def apiVersion(self):
        return "v1"

    def listDir(self, dirname: str = '/'):
        return self.connect.list(dirname)

    def getFiles(self, dirname: str, depth: int = 0) -> list:
        l = []
        if not dirname:
            print("dirname is null")
        else:
            for file in self.connect.walk(dirname, depth=depth):
                if file[-1]:
                    for f in file[-1]:
                        l.append(file[0] + '/' + f)
        return l

    def downloadToCsv(self, filename: str) -> None:
        """Only split on the '€€' sign, and generate the same filename in the current directory."""
        with self.connect.read(filename, encoding='utf-8') as reader:
            with open(csvdir + filename.split('/')[-1].split('.')[0] + '.csv', 'a+') as cf:
                for line in reader.readlines():
                    newline = line.replace('€€', ',')
                    cf.write(newline)
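# Hedged example (URL, user, and paths are placeholders); downloadToCsv also
# depends on the module-level `csvdir` assumed by the class above.
hdfs = interHDFS('http://namenode.example.com:9870', user='hadoop')
print(hdfs.listDir('/'))
print(hdfs.getFiles('/data', depth=2))  # recursive listing via client.walk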
class HDFS(BaseRepository):
    def __init__(self, host: str, port, user: str):
        super().__init__()
        self.host = host
        self.port = port
        self.user = user
        self.producer = None

    def connect(self):
        self.conn = InsecureClient(f"http://{self.host}:{self.port}", user=self.user)
        if os.environ.get("KAFKA_BOOTSTRAP", None):
            self.producer = KafkaProducer(bootstrap_servers=os.environ.get(
                "KAFKA_BOOTSTRAP", "localhost:1234"))
        else:
            self.producer = None

    def disconnect(self):
        self.save_snapshot()
        if self.producer:
            self.producer.close()

    def insert_rows(self, rows: list[tuple[datetime, str, str, str, str, str]]):
        self.add_buff(rows)
        self.flush()

    def _last_datetime(self, category, date):
        if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0:
            return config.min_date
        tfname = ''
        with tempfile.NamedTemporaryFile("wb") as tf:
            tfname = tf.name
            with self.conn.read(f"/krwordcloud/add-article/{date}",
                                chunk_size=8096) as hf:
                for chunk in hf:
                    tf.write(chunk)
            # make sure the buffered chunks hit disk before re-opening the file
            tf.flush()
            with open(tfname, 'rb') as tf:
                reader = pyorc.Reader(tf)
                maximum = datetime.datetime \
                    .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z")
                for row in reader:
                    if row[0] > maximum and row[1] == category:
                        maximum = row[0]
                if (maximum < config.min_date):
                    return config.min_date
                elif maximum > datetime.datetime.now().replace(tzinfo=KST):
                    return datetime.datetime.now().replace(tzinfo=KST)
                else:
                    return maximum

    def make_entries(self):
        entries = dict()
        hdfs_entries = dict()
        lookup_hdfs = []
        self.load_snapshot()
        for category in config.categories:
            category_rows = list(
                filter(lambda row: row[1] == category, self.buff))
            if len(category_rows) > 0:
                last = max(category_rows, key=lambda row: row[0])
                entries[category] = last[0]
            else:
                lookup_hdfs.append(category)
        try:
            dates = self.conn.list("/krwordcloud/add-article/")
            if len(dates) > 0:
                for category in lookup_hdfs:
                    found = False
                    for last in reversed(dates):
                        try:
                            entries[category] = self._last_datetime(category, last)
                            found = True
                            break
                        except Exception as e:
                            print(e)
                            continue
                    if found is False:
                        entries[category] = config.min_date
            else:
                hdfs_entries = dict.fromkeys(lookup_hdfs, config.min_date)
        except HdfsError:
            entries[category] = config.min_date
        except Exception as e:
            print(e)
        return {
            k: v
            for k, v in sorted({
                **entries,
                **hdfs_entries
            }.items(), key=lambda item: item[1])
        }

    def save_snapshot(self):
        print('save_snapshot')
        with self.conn.write("/krwordcloud/snapshot.json",
                             overwrite=True, encoding="utf-8") as f:
            data = list(
                map(lambda x: (x[0].isoformat(), x[1], x[2], x[3], x[4], x[5]),
                    self.buff))
            json.dump(data, f, ensure_ascii=False)

    def load_snapshot(self):
        print('load_snapshot')
        try:
            with self.conn.read("/krwordcloud/snapshot.json",
                                encoding="utf-8") as f:
                self.buff = list(
                    map(
                        lambda x: (parser.parse(x[0]), x[1], x[2], x[3], x[4], x[5]),
                        json.load(f)))
        except Exception:
            self.buff = []

    def flush(self):
        dates = sorted(list(set(map(lambda row: row[0].date(), self.buff))))
        if len(dates) > 1:
            for d in dates[:-1]:
                data = list(filter(lambda row: row[0].date() == d, self.buff))
                if self.producer:
                    self._kafka_flush(d, data)
                else:
                    self._hdfs_flush(d, data)
            self.buff = list(
                filter(lambda row: row[0].date() == dates[-1], self.buff))
            self.save_snapshot()

    def _kafka_flush(self, date, data):
        self.producer.send(f"add-article-{date}", data)

    def _hdfs_flush(self, date, data):
        with self.conn.write(f"/krwordcloud/add-article/{date}.orc",
                             overwrite=True) as hf:
            tfname = ''
            with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as tf:
                tfname = tf.name
                with pyorc.Writer(
                        tf,
                        schema="struct<field0:timestamp,field1:string,"
                               "field2:string,field3:string>",
                ) as of:
                    of.writerows(data)
            with open(tfname, 'rb') as tf:
                for line in tf:
                    hf.write(line)
            os.unlink(tfname)
# encoding : utf8
from kafka import KafkaConsumer
from hdfs import InsecureClient
import bson
import time

# Init HDFS
client = InsecureClient('http://X:50070', user='******')
hdfs_file = 'tweets.json'

# Create file if not exist
hdfs_files_list = client.list('')
if hdfs_file not in hdfs_files_list:
    with client.write(hdfs_file) as writer:
        writer.write('')

# Init kafka
consumer = KafkaConsumer('X', group_id='X_GRP', bootstrap_servers='X:9092')

with client.write(hdfs_file, append=True) as writer:
    # New kafka message
    for msg in consumer:
        print(time.strftime("%Y-%m-%d %H:%M:%S") + " [DEBUG] new tweet (consumer)")
        tweet = msg.value
        # Write message in HDFS
        writer.write(tweet)
def start_service():
    # Download path for the data file
    file_path = "/home/formation/Downloads/hrl_load_estimated.csv"
    # Connect to the HDFS client (credentials scrubbed in the source)
    client = InsecureClient(url='http://*****:*****', user='******')
    # NOTE: a block is elided in the source here; it presumably set up the
    # Selenium driver and opened the try block that the trailing `finally:`
    # closes, ending with a wait on the start-date input.
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//input[@default-value='defaultStartDate']")))
        # print(driver.page_source)
        # Enter the start and end dates
        elem = driver.find_element_by_xpath(
            "//input[@default-value='defaultStartDate']")
        elem.clear()
        elem.send_keys("01/01/" + str(year))
        elem = driver.find_element_by_xpath(
            "//input[@default-value='defaultEndDate']")
        elem.clear()
        elem.send_keys("12/31/" + str(year))
        # Wait for the page to reload
        time.sleep(5)
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//button[text()='Submit']")))
        # Submit the form to reload the data for the requested period
        elem = driver.find_element_by_xpath("//button[text()='Submit']")
        elem.click()
        # Wait for the export button to appear
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "dm-download")))
        elem = driver.find_element_by_class_name("dm-download")
        elem.click()
        # Wait for the file to finish downloading
        while not os.path.exists(file_path):
            time.sleep(1)
        if os.path.isfile(file_path):
            print("Fichier téléchargé pour l'année {}".format(year))
            # Rename the file
            new_file_name = file_path.replace(".csv", "_" + str(year) + ".csv")
            os.rename(file_path, new_file_name)
            # Upload the local CSV file to HDFS
            try:
                remote_load_path = client.upload('/user/root/data/pjm',
                                                 new_file_name,
                                                 overwrite=True)
                # print(remote_load_path)
            except Exception:
                print("error")
            print(client.list('/user/root/data/pjm'))
        else:
            raise ValueError("%s isn't a file!" % file_path)
    finally:
        # driver.quit()
        print("fin du traitement du fichier")
        # assert "No results found." not in driver.page_source
        driver.close()
    time.sleep(10)
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)
        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.mkdir(local_dir)
            print(self.client.download(remote_path, local_path=local_path,
                                       overwrite=True, temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            return True if self.client.status(self.path(name)) else False
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
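# Sketch of the Django settings this storage backend reads; the values are
# illustrative assumptions, not from the source.
HDFS_STORAGE = {
    'hosts': 'http://namenode.example.com:9870',  # passed straight to InsecureClient
    'root': '/django/media',                      # prefix used by path()
}
MEDIA_ROOT = '/var/www/media'   # local cache directory used by _open()
MEDIA_URL = '/media/'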