from pyhdfs import HdfsClient

def update_csv():
    '''Merge the local CSV into the remote HDFS copy, dropping duplicate rows.'''
    local = '/Users/constantine/PycharmProjects/test02/data.csv'
    tmp_local = '/Users/constantine/PycharmProjects/test02/tmpdata.csv'
    remote = '/data/data.csv'
    client = HdfsClient(hosts='127.0.0.1:9870', user_name='host')
    if client.exists(remote):
        # Pull the current remote file down, then rebuild and re-upload it.
        client.copy_to_local(remote, tmp_local)
        client.delete(remote)
        # Append the local rows to the downloaded copy ('a', not 'w', so the
        # remote rows are kept), then de-duplicate. Note that set() does not
        # preserve row order.
        with open(local, 'r') as f_read, open(tmp_local, 'a') as f_write:
            f_write.write(f_read.read())
        with open(tmp_local, 'r') as f_read:
            rows = set(f_read.read().split('\n'))
        rows.discard('')  # drop empty lines rather than an arbitrary [1:] slice of a set
        with open(tmp_local, 'w') as f_write:
            f_write.write('\n'.join(rows))
        client.copy_from_local(tmp_local, remote)
    else:
        client.copy_from_local(local, remote)
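# For reference, a minimal pyhdfs round trip exercising the calls update_csv()
# relies on. The file names below are made up for illustration; host and user
# are the same ones used above.
client = HdfsClient(hosts='127.0.0.1:9870', user_name='host')
client.copy_from_local('example.csv', '/data/example.csv')    # upload
assert client.exists('/data/example.csv')
client.copy_to_local('/data/example.csv', 'roundtrip.csv')    # download
client.delete('/data/example.csv')                            # remove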
def __push_fields(self, hdfs_host: str, fields: Dict[str, Field]):
    # Serialize the torchtext fields and push them to the training HDFS.
    fs = HdfsClient(hdfs_host)
    fs.mkdirs('/'.join(self.fields_path.split('/')[:-1]))
    fs.delete(self.fields_path)
    dicted_fields = {k: self.field_to_dict(v) for k, v in fields.items()}
    fs.create(self.fields_path, json.dumps(dicted_fields))

    # Push a vocab-free variant to the C3 HDFS: token attributes are replaced
    # by their vocabulary indices so consumers do not need the vocab itself.
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    fs.mkdirs('/'.join(self.c3_fields_path.split('/')[:-1]))
    fs.delete(self.c3_fields_path)
    c3_dicted_fields = {}
    for k, value in dicted_fields.items():
        if value['use_vocab']:
            max_vocab_index = len(value['vocab']['itos'])
            value['max_vocab_index'] = max_vocab_index
            value['dtype'] = str(torch.int64)
            vocab = value['vocab']
            for tok in self.FIELDS_TOKEN_ATTRS:
                if value[tok]:
                    value[tok] = vocab['stoi'][value[tok]]
            value.pop('vocab')
            value['use_vocab'] = False
        else:
            value['max_vocab_index'] = 1
        c3_dicted_fields[k] = value
    fs.create(self.c3_fields_path, json.dumps(c3_dicted_fields))
def hdfs_clean(host, user_name, output_dir, experiment_id=None):
    '''clean up hdfs data'''
    hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name,
                             webhdfs_path='/webhdfs/api/v1', timeout=5)
    if experiment_id:
        full_path = '/' + '/'.join([user_name, 'nni', 'experiments', experiment_id])
    else:
        full_path = '/' + '/'.join([user_name, 'nni', 'experiments'])
    print_normal('removing folder {0} in hdfs'.format(full_path))
    hdfs_client.delete(full_path, recursive=True)
    if output_dir:
        # Escape the dots in the IPv4 pattern so they only match literal dots.
        pattern = re.compile(r'hdfs://(?P<host>([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?')
        match_result = pattern.match(output_dir)
        if match_result:
            output_host = match_result.group('host')
            output_dir = match_result.group('baseDir')
            # check that the hdfs:// URI points at the host we just cleaned
            if output_host != host:
                print_warning('The host in {0} is not consistent with {1}'.format(output_dir, host))
            else:
                if experiment_id:
                    output_dir = output_dir + '/' + experiment_id
                print_normal('removing folder {0} in hdfs'.format(output_dir))
                hdfs_client.delete(output_dir, recursive=True)
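# A hypothetical invocation; host, user, output dir, and experiment id are
# placeholders, not values from the original source.
hdfs_clean('10.10.10.10', 'nni', 'hdfs://10.10.10.10:9000/nni-outputs', experiment_id='GExp123')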
from pyhdfs import HdfsClient

def Copy_From_Local(file):
    '''Upload a file to Hadoop.'''
    h_file = '/tmp/te/%s' % file
    client = HdfsClient(hosts='localhost:50070')  # HDFS address; connect to HDFS
    # If the file already exists on HDFS, delete it before uploading.
    if client.exists(h_file):
        client.delete(h_file)
    client.copy_from_local(file, h_file)
import cv2
from pyhdfs import HdfsClient

def save_img(path, corlor_pic):
    # Create the HDFS client.
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    # Read a local image (it could also be generated with numpy), e.g.:
    # mat = cv2.imread(r"C:\Users\HUAWEI\Pictures\1.png")
    # The // 1 divisor is a placeholder scale factor; raise it to downscale.
    corlor_pic = cv2.resize(corlor_pic, (corlor_pic.shape[1] // 1, corlor_pic.shape[0] // 1))
    # Write the PNG-encoded image to the HDFS save path, replacing any existing file.
    if client.exists(path):
        client.delete(path)
    client.create(path, cv2.imencode('.png', corlor_pic)[1].tobytes())
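# The original has no read-back counterpart; a minimal sketch, assuming the
# same host/user as above and that `path` holds PNG bytes:
import numpy as np

def load_img(path):
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    data = client.open(path).read()  # pyhdfs.open returns a file-like object
    return cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_COLOR)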
import time
from pyhdfs import HdfsClient

class DeleteHdfsData():
    # Initialization
    def __init__(self):
        self.host = "172.27.133.18"
        self.port = "8020"
        self.userName = "******"
        self.remotePath = "/user/shiyouguandao"
        # user_name must be passed by keyword: HdfsClient's second positional
        # parameter is not the user name.
        self.fs = HdfsClient(hosts=self.host, user_name=self.userName)
        self.ReadHdfsFile()  # instantiating the class runs the clean-up immediately

    # Scan, delete, and log the result
    def ReadHdfsFile(self):
        count = self.HdfsFileList(self.remotePath)
        localtime = time.asctime(time.localtime(time.time()))
        log.info(localtime + "\tdeleted " + str(count) + " csv files under /user/shiyouguandao...")

    # Delete a single file (the original deleted a hard-coded path here)
    def DeleteHdfsFile(self, hdfsPath):
        self.fs.delete(hdfsPath, skip_trash=False)

    # Walk the directory tree and delete every matching file
    def HdfsFileList(self, path):
        count = 0
        for root, dirs, files in self.fs.walk(path):
            for file in files:
                if self.hdfsFileHandler(file):
                    self.DeleteHdfsFile(path + "/" + file)
                    count += 1
        return count

    # File filter: .csv files whose embedded date (the 10 characters before the
    # last "_", e.g. feature_ZSY-69_2019-09-24_23411.csv) is at least one day old
    def hdfsFileHandler(self, fileName):
        if not fileName.endswith(".csv"):
            return False
        one_day = 60 * 60 * 24
        index = fileName.rfind("_")
        date_str = fileName[index - 10:index]  # renamed to avoid shadowing built-in str
        current = int(time.time())
        fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
        return (current - fileTime) >= one_day
def test_read_files_with_spaces(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_spaces'
    if fs.exists(dir):
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)

    hdfs_api.write_data(f"{dir}/test test test 1.txt", "1\n")
    hdfs_api.write_data(f"{dir}/test test test 2.txt", "2\n")
    hdfs_api.write_data(f"{dir}/test test test 3.txt", "3\n")

    node1.query(f"create table test (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{dir}/test*', 'TSV')")
    assert node1.query("select * from test order by id") == "1\n2\n3\n"
    fs.delete(dir, recursive=True)
def drop_table(cluster):
    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    hdfs_objects = fs.listdir('/clickhouse')
    print('Number of hdfs objects to delete:', len(hdfs_objects), sep=' ')

    node.query("DROP TABLE IF EXISTS hdfs_test SYNC")

    try:
        wait_for_delete_hdfs_objects(cluster, 0)
    finally:
        hdfs_objects = fs.listdir('/clickhouse')
        if len(hdfs_objects) == 0:
            return
        print("Manually removing extra objects to prevent tests cascade failing: ", hdfs_objects)
        for path in hdfs_objects:
            fs.delete(path)
def test_hdfsCluster(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_hdfsCluster'
    if fs.exists(dir):
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)
    hdfs_api.write_data("/test_hdfsCluster/file1", "1\n")
    hdfs_api.write_data("/test_hdfsCluster/file2", "2\n")
    hdfs_api.write_data("/test_hdfsCluster/file3", "3\n")

    actual = node1.query(
        "select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    actual = node1.query(
        "select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected
    fs.delete(dir, recursive=True)
def __push_preprocessed(self, c3_path: str, user_name: str, dataset: Dataset):
    def push_to_hdfs(jstrs):
        # Create the file on the first write, append on subsequent writes.
        if not fs.exists(c3_path):
            fs.create(c3_path, '\n'.join(jstrs) + '\n')
        else:
            fs.append(c3_path, '\n'.join(jstrs) + '\n')

    fs = HdfsClient(self.C3_HDFS_HOST, user_name=user_name)
    fs.mkdirs('/'.join(c3_path.split('/')[:-1]))
    fs.delete(c3_path)

    # Serialize each preprocessed instance to JSON and push in buffered batches.
    # (Note: tqdm's maxinterval is in seconds; total= may have been intended.)
    jstrs = []
    BUFSIZE = 2048
    for fxed_instance in tqdm(Iterator(dataset, batch_size=1), maxinterval=len(dataset.examples)):
        fxed_instance_dict = {name: getattr(fxed_instance, name).tolist()[0]
                              for name in self.fields.keys()}
        jstrs.append(json.dumps(fxed_instance_dict))
        if len(jstrs) >= BUFSIZE:
            push_to_hdfs(jstrs)
            jstrs = []
    if jstrs:
        push_to_hdfs(jstrs)
def _make_empty_dir(self, client: HdfsClient) -> None:
    # Get an empty dir: the second delete returns False because the path is already gone.
    client.delete(TEST_DIR, recursive=True)
    assert not client.delete(TEST_DIR, recursive=True)
    assert client.mkdirs(TEST_DIR)
class HDFSClientUtilityTest(unittest.TestCase):
    '''Unit test for hdfsClientUtility.py'''

    def setUp(self):
        self.hdfs_file_path = '../../.vscode/hdfsInfo.json'
        self.hdfs_config = None
        try:
            with open(self.hdfs_file_path, 'r') as file:
                self.hdfs_config = json.load(file)
        except Exception as exception:
            print(exception)

        self.hdfs_client = HdfsClient(hosts='{0}:{1}'.format(self.hdfs_config['host'], '50070'),
                                      user_name=self.hdfs_config['userName'])

    def get_random_name(self, length):
        return ''.join(random.sample(string.ascii_letters + string.digits, length))

    def test_copy_file_run(self):
        '''test copyFileToHdfs'''
        file_name = self.get_random_name(8)
        file_content = 'hello world!'
        with open('./{}'.format(file_name), 'w') as file:
            file.write(file_content)

        result = copyFileToHdfs('./{}'.format(file_name),
                                '/{0}/{1}'.format(self.hdfs_config['userName'], file_name),
                                self.hdfs_client)
        self.assertTrue(result)

        file_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName']))
        self.assertIn(file_name, file_list)

        hdfs_file_name = self.get_random_name(8)
        self.hdfs_client.copy_to_local('/{0}/{1}'.format(self.hdfs_config['userName'], file_name),
                                       './{}'.format(hdfs_file_name))
        self.assertTrue(os.path.exists('./{}'.format(hdfs_file_name)))
        with open('./{}'.format(hdfs_file_name), 'r') as file:
            content = file.readline()
            self.assertEqual(file_content, content)

        # clean up
        os.remove('./{}'.format(file_name))
        os.remove('./{}'.format(hdfs_file_name))
        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], file_name))

    def test_copy_directory_run(self):
        '''test copyDirectoryToHdfs'''
        directory_name = self.get_random_name(8)
        file_name_list = [self.get_random_name(8), self.get_random_name(8)]
        file_content = 'hello world!'

        os.makedirs('./{}'.format(directory_name))
        for file_name in file_name_list:
            with open('./{0}/{1}'.format(directory_name, file_name), 'w') as file:
                file.write(file_content)

        result = copyDirectoryToHdfs('./{}'.format(directory_name),
                                     '/{0}/{1}'.format(self.hdfs_config['userName'], directory_name),
                                     self.hdfs_client)
        self.assertTrue(result)

        directory_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName']))
        self.assertIn(directory_name, directory_list)

        sub_file_list = self.hdfs_client.listdir('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name))
        for file_name in file_name_list:
            self.assertIn(file_name, sub_file_list)
            # clean up
            self.hdfs_client.delete('/{0}/{1}/{2}'.format(self.hdfs_config['userName'], directory_name, file_name))
        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name))
        shutil.rmtree('./{}'.format(directory_name))
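# setUp expects ../../.vscode/hdfsInfo.json to supply the connection settings.
# Judging from the keys read above, a minimal config would look like this
# (placeholder values, not from the original source):
#
# {
#     "host": "10.10.10.10",
#     "userName": "nni"
# }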
class hdfs(object):
    # WebHDFS defaults to port 50070
    def __init__(self, cur_database_param):
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        # Delegate to the client (calling self.concat here would recurse forever).
        self.hdfs.concat(target, sources)

    def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None, data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        return self.hdfs.listdir(rule)

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get("task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update({"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({"task_job_del_flag": "False", "task_job_create_time": createTime})
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            # pass data by keyword so it is not taken as jobTemplateFieldList
            self.createTableByTaskJobId(jobid, tablename, data=column_dict)

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        return self.hdfs.exists(path)

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data=data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    method = getattr(self, sql)
                    method(path, data)
            else:
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
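# A minimal usage sketch for the wrapper above, assuming a WebHDFS endpoint on
# localhost and a made-up base path (both are placeholders):
cur_database_param = {'url': 'localhost:50070', 'dbname': '/user/crawler'}
store = hdfs(cur_database_param)
store.save_to_hdfs2(store.path + '/items', '{"id": 1}\n')  # creates on first call, appends afterwards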