def Run(jobTime):
    Client = HdfsClient(hosts=HDFS)
    Log = LogTime(jobTime)
    if Client.exists(Log.HPath + Log.success):
        r.rset(Rkey, jobTime, 1)
        logger.info("-------- %s -----------" % jobTime)
        for dm in Domains:
            Files = Log.HPath + dm + Log.LogName
            if Client.exists(Files):
                Dfile = DOWN_DIR + '/' + dm + Log.UpFile
                TmpFile = dm + Log.UpTmp()
                try:
                    sts = time.time()
                    logger.info('DownloadStart... %s' % Files)
                    Client.copy_to_local(Files, Dfile)
                    logger.info('DownloadSuccess... %s %s' % (Files, Dfile))
                    Upload(TmpFile, Dfile, Log.PathDay)
                    Rename(Dday + "/" + TmpFile)
                    logger.info('UploadSuccess... %s' % Dfile)
                    ets = time.time()
                    Write(jobTime, dm, sts, ets, 200)
                    r.rdel(Rkey, jobTime)
                except Exception as e:
                    Write(jobTime, dm, 500)
                    r.rdel(Rkey, jobTime)
                    logger.error(e)
            else:
                logger.warn(Files + ' Non-existent')
                continue
        r.rdel(Rkey, jobTime)
def ProcAll(LocalDir, HdfsDir):
    NameNode = GolobalConfig['hdfs']['NameNode']
    UserName = GolobalConfig['hdfs']['UserName']
    client = HdfsClient(hosts=NameNode, user_name=UserName)
    if not client.exists(HdfsDir):
        client.mkdirs(HdfsDir)
    total = len(os.listdir(LocalDir))
    processed = 0
    failedList = list()
    FileSize = 0
    StartTime = time.time()
    for filename in os.listdir(LocalDir):
        srcFile = os.path.join(LocalDir, filename)
        dstFile = HdfsDir + '/' + filename
        if not ProcOne(client, srcFile, dstFile):
            failedList.append(srcFile)
        else:
            FileSize += os.path.getsize(srcFile)
        processed += 1
        print('%d/%d/%d, time cost: %.2f s' % (total, processed, len(failedList), time.time() - StartTime))
        print('%d B, %.2f MB/s \n' % (FileSize, FileSize / 1024 / 1024 / (time.time() - StartTime)))
    if failedList:
        print('failedList: %s' % repr(failedList))
        return False
    else:
        print('Good! No Error!')
        print('%d B, %.2f MB, %.2f GB, %.2f MB/s' %
              (FileSize, FileSize / 1024 / 1024, FileSize / 1024 / 1024 / 1024,
               FileSize / 1024 / 1024 / (time.time() - StartTime)))
        return True
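# ProcOne is referenced by ProcAll above but not included in the original
# snippet. The sketch below is a hypothetical stand-in that matches how it is
# called (returns True on success, False on failure); the overwrite-then-copy
# behaviour is an assumption, not the original implementation.
def ProcOne(client, srcFile, dstFile):
    try:
        if client.exists(dstFile):
            client.delete(dstFile)
        client.copy_from_local(srcFile, dstFile)
        return True
    except Exception as e:
        print('upload failed: %s -> %s (%s)' % (srcFile, dstFile, e))
        return False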
class HDFSSErvice:
    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunck_size = 100000

    def __init__(self):
        self._client = HdfsClient(hosts=self.namenode_host + ":" + self.namenode_port,
                                  user_name="root")

    def get(self, hdfs_path: str):
        # Stream the file back in fixed-size chunks; `length` is the number of
        # bytes to read from `offset`, not the end position.
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunck_size):
            file_response = self._client.open(hdfs_path, offset=i, length=self.chunck_size)
            yield file_response.read()

    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)

    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        return int(self.get_file_size(hdfs_path) / self.chunck_size + 1)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
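# Usage sketch for the service above. The NameNode address comes from the
# class defaults (localhost:9870); the path and payload are illustrative
# assumptions, not taken from the original code.
def _hdfs_service_demo():
    service = HDFSSErvice()
    service.append("/tmp/example.log", b"first message\n")
    # Read the file back chunck_size bytes at a time via the generator.
    for chunk in service.get("/tmp/example.log"):
        print("read %d bytes" % len(chunk))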
def update_csv():
    local = '/Users/constantine/PycharmProjects/test02/data.csv'
    tmpLocal = '/Users/constantine/PycharmProjects/test02/tmpdata.csv'
    remote = '/data/data.csv'
    host = '127.0.0.1:9870'
    user_name = 'host'
    client = HdfsClient(hosts=host, user_name=user_name)
    if client.exists(remote):
        # Pull the current remote copy down, append the local rows to it,
        # drop duplicate lines, then push the merged file back to HDFS.
        client.copy_to_local(remote, tmpLocal)
        client.delete(remote)
        with open(local, 'r') as fRead, open(tmpLocal, 'a') as fWrite:
            fWrite.writelines(fRead.readlines())
        with open(tmpLocal, 'r') as fRead:
            lines = fRead.read()
        with open(tmpLocal, 'w') as fWrite:
            fWrite.write('\n'.join(set(lines.split('\n'))))
        client.copy_from_local(tmpLocal, remote)
    else:
        client.copy_from_local(local, remote)
def load_fields_with_vocab(self, hdfs_host: str) -> Dict[str, Field]:
    fs = HdfsClient(hdfs_host)
    if fs.exists(self.fields_path):
        print(f'get fields from {hdfs_host}{self.fields_path}')
    else:
        raise Exception(f'there are no fields in {hdfs_host}{self.fields_path}')
    loaded_dict = json.loads(fs.open(self.fields_path).read())
    return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}
def load_fields_from_c3(self) -> Tuple[Dict[str, Field], Dict[str, int]]:
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    if fs.exists(self.c3_fields_path):
        print(f'get fields from {self.C3_HDFS_HOST}{self.c3_fields_path}')
    else:
        raise Exception(f'there are no fields in {self.C3_HDFS_HOST}{self.c3_fields_path}')
    loaded_dict = json.loads(fs.open(self.c3_fields_path).read())
    print(loaded_dict)
    max_vocab_indexes = {k: v['max_vocab_index'] for k, v in loaded_dict.items()}
    return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}, max_vocab_indexes
def Copy_From_Local(file):
    '''Upload a file to Hadoop'''
    h_file = ('/tmp/te/%s' % file)
    client = HdfsClient(hosts='localhost:50070')  # HDFS address; connect to HDFS
    if client.exists(h_file):
        client.delete(h_file)  # if the file already exists on HDFS, delete it first
    client.copy_from_local(file, h_file)
def basic():
    client = HdfsClient(hosts='study:50070')
    print(client.list_status('/'))

    # Check whether a given path exists
    print(client.exists("/test"))
    print(client.exists("/data/gz/thrift-0.9.2.tar.gz"))

    client = HdfsClient(hosts='study:50070')
    print(client.get_file_checksum("/data/gz/bison-2.5.1.tar.gz"))
    summary = client.get_content_summary("/")
    print(summary)

    # File copy: from HDFS to the local filesystem
    client.copy_to_local("/data/gz/pip-7.1.2.tar.gz", "/root/data/pip-7.1.2.tar.gz")
    # File copy: from the local filesystem to HDFS
    client.copy_from_local("/root/data/thrift-0.9.2.tar.gz", "/data/gz/thrift-0.9.2.tar.gz")
    print(client.get_home_directory())
def save_img(path, corlor_pic):
    # Create the HDFS client connection
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    # Read a local image (it could also be generated with numpy)
    # mat = cv2.imread(r"C:\Users\HUAWEI\Pictures\1.png")
    corlor_pic = cv2.resize(
        corlor_pic, (corlor_pic.shape[1] // 1, corlor_pic.shape[0] // 1))
    # Write the PNG-encoded bytes to the HDFS destination path
    if client.exists(path):
        client.delete(path)
    client.create(path, cv2.imencode('.png', corlor_pic)[1].tobytes())
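# Companion sketch (not part of the original code): read the PNG bytes back
# from HDFS and decode them with OpenCV. It assumes the same host and user as
# save_img above and that numpy is imported as np.
def load_img(path):
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    data = client.open(path).read()
    # imdecode expects a 1-D uint8 buffer holding the encoded image bytes
    return cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_COLOR)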
def load_matrix(self, filepath, shape=None):
    if os.environ['local'] == '1' and os.path.exists(filepath):
        return np.loadtxt(filepath, dtype=float)
    else:
        hosts = os.environ['hosts']
        if len(hosts) == 0:
            hosts = 'master'
        client = HdfsClient(hosts=hosts)
        if client.exists(filepath):
            # The file holds raw float64 bytes, so decode with frombuffer
            return np.frombuffer(client.open(filepath).read()).reshape(shape)
        return False
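# Hypothetical counterpart to load_matrix (not in the original source): writes
# a matrix either locally or to HDFS, mirroring the environment switches used
# above. The raw-float64 layout matches what load_matrix decodes.
def save_matrix(self, filepath, matrix):
    if os.environ['local'] == '1':
        np.savetxt(filepath, matrix)
    else:
        hosts = os.environ['hosts'] or 'master'
        client = HdfsClient(hosts=hosts)
        if client.exists(filepath):
            client.delete(filepath)
        client.create(filepath, matrix.astype(np.float64).tobytes())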
def _load_preprocessed(self) -> List[Example]:
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    if fs.exists(self.c3_path):
        print(f'get preprocessed corpus from {self.C3_HDFS_HOST}{self.c3_path}')
    else:
        raise Exception(f'there is no preprocessed corpus in {self.C3_HDFS_HOST}{self.c3_path}')
    preprocessed = []
    for line in fs.open(self.c3_path).read().decode().split('\n'):
        if line:
            ex = Example()
            for k, v in json.loads(line).items():
                setattr(ex, k, v)
            preprocessed.append(ex)
    return preprocessed
def test_read_files_with_spaces(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_spaces'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)

    hdfs_api.write_data(f"{dir}/test test test 1.txt", "1\n")
    hdfs_api.write_data(f"{dir}/test test test 2.txt", "2\n")
    hdfs_api.write_data(f"{dir}/test test test 3.txt", "3\n")

    node1.query(f"create table test (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{dir}/test*', 'TSV')")
    assert node1.query("select * from test order by id") == "1\n2\n3\n"
    fs.delete(dir, recursive=True)
def test_hdfsCluster(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_hdfsCluster'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)
    hdfs_api.write_data("/test_hdfsCluster/file1", "1\n")
    hdfs_api.write_data("/test_hdfsCluster/file2", "2\n")
    hdfs_api.write_data("/test_hdfsCluster/file3", "3\n")

    actual = node1.query(
        "select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    actual = node1.query(
        "select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    fs.delete(dir, recursive=True)
list_dirs = [
    x['pathSuffix'] for x in client.list_status(dir_dataroot)
    if x['type'] == 'DIRECTORY'
]
list_dirs.sort()
for subdir in list_dirs:
    dir_subdata = os.path.join(dir_dataroot, subdir)
    logger.debug('data path : %s' % dir_subdata)
    dir_subdata_cleaned = os.path.join(dir_subdata, 'cleaned4netsec')
    logger.debug('data path for cleaned files : %s' % dir_subdata_cleaned)
    list_subdir_date = [
        x['pathSuffix'] for x in client.list_status(dir_subdata)
        if x['type'] == 'FILE'
    ]
    if len(list_subdir_date) > 0:
        if not client.exists(dir_subdata_cleaned):
            client.mkdirs(dir_subdata_cleaned)
            logger.debug('mkdir dir for cleaned files : %s' % dir_subdata_cleaned)
        list_subdir_date_cleaned = [
            x['pathSuffix'] for x in client.list_status(dir_subdata_cleaned)
            if x['type'] == 'FILE'
        ]
        list_subdir_date.sort()
        for fname in list_subdir_date:
            if fname in list_subdir_date_cleaned:
                # TODO: to debug
                # if client.exists(os.path.join(dir_subdata_cleaned, fname)):
class hdfs(object):
    # Defaults to port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost=self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        self.hdfs.concat(target, sources)

    def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None, data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get("task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update({"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    method = getattr(self, sql)
                    method(path, data)
            else:
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
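# Usage sketch for the wrapper class above. The connection URL, base path and
# payload are placeholder assumptions, not values from the original code.
def _hdfs_wrapper_demo():
    store = hdfs({'url': 'namenode-host:50070', 'dbname': '/user/crawler'})
    # First write creates the file; later writes append to it.
    store.save_to_hdfs2('/user/crawler/demo_table', b'a,b,c\n')
    print(store.hdfs.open('/user/crawler/demo_table').read())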
def yesterday():
    return today() - datetime.timedelta(days=1)


# Main entry point
if __name__ == '__main__':
    print("Monitoring HDFS......")
    yesterday_datetime_format = yesterday()
    for table in CHECK_TABLE:
        is_success = False
        has_data = False
        content = ""
        try:
            path = ROOT_DIR + table + "/" + str(yesterday_datetime_format)
            if client.exists(path):
                client_list = client.list_status(path)
                for file_status in client_list:
                    if file_status.get("pathSuffix").startswith('part-') and int(file_status.get("length")) > 0:
                        has_data = True
                    elif file_status.get("pathSuffix") == "_SUCCESS":
                        is_success = True
            else:
                content = "Error: HDFS path does not exist <br>" + \
                          "HDFS path: " + path
        except Exception as e:
            content = "Error: " + str(e) + "<br>" + \
                      "HDFS path: " + path
        if (content == "") and (not is_success):
            content = "Error: " + table + " related job failed" + "<br>" + \