def __push_fields(self, hdfs_host: str, fields: Dict[str, Field]):
    # Serialize the field metadata and write it to the primary HDFS path.
    fs = HdfsClient(hdfs_host)
    fs.mkdirs('/'.join(self.fields_path.split('/')[:-1]))
    fs.delete(self.fields_path)
    dicted_fields = {k: self.field_to_dict(v) for k, v in fields.items()}
    fs.create(self.fields_path, json.dumps(dicted_fields))

    # Write a vocab-free copy to the C3 cluster: token attributes are replaced by
    # their vocabulary indices and only max_vocab_index is kept.
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    fs.mkdirs('/'.join(self.c3_fields_path.split('/')[:-1]))
    fs.delete(self.c3_fields_path)
    c3_dicted_fields = {}
    for k, value in dicted_fields.items():
        if value['use_vocab']:
            max_vocab_index = len(value['vocab']['itos'])
            value['max_vocab_index'] = max_vocab_index
            value['dtype'] = str(torch.int64)
            vocab = value['vocab']
            for tok in self.FIELDS_TOKEN_ATTRS:
                if value[tok]:
                    value[tok] = vocab['stoi'][value[tok]]
            value.pop('vocab')
            value['use_vocab'] = False
        else:
            value['max_vocab_index'] = 1
        c3_dicted_fields[k] = value
    fs.create(self.c3_fields_path, json.dumps(c3_dicted_fields))
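# A minimal sketch (not part of the original class) of reading the serialized field
# metadata back; the host, user and path below stand in for self.C3_HDFS_HOST,
# self.user_name and self.c3_fields_path and are assumptions for illustration.
import json

from pyhdfs import HdfsClient

fs = HdfsClient("namenode:9870", user_name="example_user")
f = fs.open("/example/c3_fields.json")
c3_fields = json.loads(f.read())
f.close()
for name, meta in c3_fields.items():
    # After __push_fields, each entry carries max_vocab_index instead of a full vocab.
    print(name, meta["max_vocab_index"])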
from pyhdfs import HdfsClient


class HDFSService:
    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunk_size = 100000

    def __init__(self):
        self._client = HdfsClient(
            hosts=self.namenode_host + ":" + self.namenode_port,
            user_name="root")

    def get(self, hdfs_path: str):
        # Stream the file back chunk by chunk; `length` is the size of each read,
        # not the end offset.
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunk_size):
            file_response = self._client.open(
                hdfs_path, offset=i, length=self.chunk_size)
            yield file_response.read()

    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)

    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        return int(self.get_file_size(hdfs_path) / self.chunk_size + 1)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
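# A brief usage sketch for the service above (illustrative only): the NameNode at
# localhost:9870 and the path /logs/messages.txt are assumptions, not part of the class.
service = HDFSService()
service.append("/logs/messages.txt", b"hello\n")   # first call also creates the file
for chunk in service.get("/logs/messages.txt"):
    print(len(chunk), "bytes read")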
import datetime

from pyhdfs import HdfsClient


def upload_txt_to_hdfs(arr):
    client = HdfsClient(hosts="localhost:50070", user_name="Alphalbj")
    name = ("/words/words-"
            + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + ".txt")
    content = ""
    for word in arr:
        content += word + " "
    client.create(name, content.encode('utf-8'))
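# A possible call site for the helper above (illustrative only): the /words directory
# and write permission for user "Alphalbj" are assumed.
upload_txt_to_hdfs(["hadoop", "spark", "hdfs"])
# -> creates e.g. /words/words-2024-01-01-12-00-00.txt containing "hadoop spark hdfs "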
def _setup_walk(self, client: HdfsClient) -> Callable[..., str]:
    def path(*args: str) -> str:
        return posixpath.join(TEST_DIR, *args)

    self._make_empty_dir(client)
    client.create(path("f1"), b"")
    client.mkdirs(path("a1", "b1"))
    client.create(path("a1", "b1", "f2"), b"")
    client.mkdirs(path("a1", "b2"))
    client.mkdirs(path("a2"))
    return path
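# A sketch (not from the original test) of traversing the tree created above with
# HdfsClient.walk(), which mirrors os.walk(); TEST_DIR and the client fixture are
# assumed to come from the surrounding test class.
def _walk_example(self, client: HdfsClient) -> None:
    path = self._setup_walk(client)
    for root, dirnames, filenames in client.walk(path()):
        # e.g. the first tuple is (TEST_DIR, ['a1', 'a2'], ['f1'])
        print(root, sorted(dirnames), sorted(filenames))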
import cv2

from pyhdfs import HdfsClient


def save_img(path, corlor_pic):
    # Create the HDFS client
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    # Read a local image (it can also be generated with the numpy module)
    # mat = cv2.imread(r"C:\Users\HUAWEI\Pictures\1.png")
    corlor_pic = cv2.resize(
        corlor_pic, (corlor_pic.shape[1] // 1, corlor_pic.shape[0] // 1))
    # Write to the HDFS destination path, replacing any existing file
    if client.exists(path):
        client.delete(path)
    client.create(path, cv2.imencode('.png', corlor_pic)[1].tobytes())
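# A complementary sketch (not in the original) for reading an image back from HDFS,
# assuming the same host, user and an existing PNG at `path`.
import cv2
import numpy as np

from pyhdfs import HdfsClient


def load_img(path):
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    # Read the PNG bytes back and decode them into a BGR array.
    data = np.frombuffer(client.open(path).read(), dtype=np.uint8)
    return cv2.imdecode(data, cv2.IMREAD_COLOR)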
def save_model(self, savemodel, model, appendix=None):
    if savemodel:
        c3_path = f'/user/{self.username}/fortuna/model/{self.trainfile}_{self.testnum}/model'
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
        if appendix:
            c3_path += f'_{appendix}'
        model_pickle = pickle.dumps(model.state_dict())
        try:
            fs.create(c3_path, model_pickle, overwrite=True)
        except Exception as e:
            print(e)
    else:
        file_name = 'data_out/model'
        if appendix:
            file_name += f'_{appendix}'
        torch.save({'model': model.state_dict(),
                    'task': type(self.task).__name__}, file_name)
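# A hedged sketch of the matching load step (not part of the original class); the
# host, user and model path are placeholders for self.C3_HDFS_HOST, self.username
# and the c3_path built in save_model.
import pickle

from pyhdfs import HdfsClient

fs = HdfsClient("namenode:9870", user_name="example_user")
response = fs.open("/user/example_user/fortuna/model/train_1/model")
state_dict = pickle.loads(response.read())
# model.load_state_dict(state_dict)   # `model` must be the same architecture that was saved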
def _make_dir_and_file(self, client: HdfsClient) -> None:
    self._make_empty_dir(client)
    client.create(TEST_FILE, FILE_CONTENTS)
# (fragment: body of a loop over HDFS data files; client, fname, f_fullname,
#  dir_subdata, constants and s_guapairiqi come from the surrounding code)
logger.debug('doing file : %s' % f_fullname)
f = client.open(f_fullname)
try:
    f_context = f.read().decode('gbk')
except UnicodeDecodeError as e:
    logger.error('decode error : %s' % f_fullname)
    logger.error(e)
    dir_error = os.path.join(dir_subdata, 'error_cleaning')
    if not client.exists(dir_error):
        client.mkdirs(dir_error)
        logger.debug('mkdir dir for error files : %s' % dir_error)
    # TODO: if success delete error files
    fname_error = os.path.join(dir_error, fname)
    if not client.exists(fname_error):
        client.create(fname_error, None)
        logger.warning('create error flag file : %s' % fname_error)
    continue
finally:
    f.close()

list_datalines = f_context.split('\n')
s_write_buffer = ''
for line in list_datalines:
    a = public.transform_line(constants, line)
    if a:
        a.append(s_guapairiqi)
        s_write_buffer += '\t'.join(a)
        s_write_buffer += '\n'
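# A hedged guess (not part of the original fragment) at the follow-up step: after the
# loop above finishes, the cleaned buffer is presumably written back to HDFS. The
# destination path fname_out is hypothetical.
fname_out = os.path.join(dir_subdata, 'cleaned', fname)
if client.exists(fname_out):
    client.delete(fname_out)
client.create(fname_out, s_write_buffer.encode('utf-8'))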
class hdfs(object):
    # Defaults to port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost = self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        # Delegate to the client; calling self.concat here would recurse forever.
        self.hdfs.concat(target, sources)

    def createTableByTaskJobId(self, taskJobId, tableName=None,
                               jobTemplateFieldList=None, data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get(
            "task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update(
                {"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            # pass the payload as `data` so create() actually writes it
            self.createTableByTaskJobId(jobid, tablename, data=column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data=data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
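# A brief usage sketch for the wrapper above (hypothetical connection parameters):
# 'url' is the NameNode host:port and 'dbname' is the base path under which the
# "tables" are stored.
store = hdfs({'url': 'namenode-host:50070', 'dbname': '/warehouse/jobs'})
store.save_to_hdfs2('/warehouse/jobs/events', b'{"event": "started"}\n')
print(store.listdir('/warehouse/jobs'))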
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyhdfs import HdfsClient
'''
Connect to Hadoop's HDFS file system from Python to upload and download files.
'''

# Read a file from the HDFS file system
# HDFS (NameNode) address
# client = HdfsClient(hosts='192.168.1.163:50070')
client = HdfsClient(hosts='192.168.1.156:50070')
print(client.listdir("/repo/"))
res = client.open('/repo/README.txt')
for r in res:
    line = str(r, encoding='utf-8')  # open() returns binary; str() converts and decodes it
    print(line)

client = HdfsClient(hosts='192.168.1.156:50070', user_name='hadoop')  # only the hadoop user has write permission
str1 = 'hello world'
client.create('/py.txt', str1)  # create a new file and write the string to it

# Upload a local file to HDFS
# client = HdfsClient(hosts='hacker:50070', user_name='root')
# Absolute path of the local file; the HDFS target must not already exist
# client.copy_from_local('D:/PythonProjects/crawl_work/thread_crawl_work02', '/usr/hadoop/')
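# A complementary sketch for the download direction (hypothetical local path);
# copy_to_local mirrors copy_from_local and reuses the host shown above.
client = HdfsClient(hosts='192.168.1.156:50070', user_name='hadoop')
client.copy_to_local('/repo/README.txt', './README.txt')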