def __push_fields(self, hdfs_host: str, fields: Dict[str, Field]):
    fs = HdfsClient(hdfs_host)
    fs.mkdirs('/'.join(self.fields_path.split('/')[:-1]))
    fs.delete(self.fields_path)
    dicted_fields = {k: self.field_to_dict(v) for k, v in fields.items()}
    fs.create(self.fields_path, json.dumps(dicted_fields))

    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    fs.mkdirs('/'.join(self.c3_fields_path.split('/')[:-1]))
    fs.delete(self.c3_fields_path)
    c3_dicted_fields = {}
    for k, value in dicted_fields.items():
        if value['use_vocab']:
            max_vocab_index = len(value['vocab']['itos'])
            value['max_vocab_index'] = max_vocab_index
            value['dtype'] = str(torch.int64)
            vocab = value['vocab']
            for tok in self.FIELDS_TOKEN_ATTRS:
                if value[tok]:
                    value[tok] = vocab['stoi'][value[tok]]
            value.pop('vocab')
            value['use_vocab'] = False
        else:
            value['max_vocab_index'] = 1
        c3_dicted_fields[k] = value
    fs.create(self.c3_fields_path, json.dumps(c3_dicted_fields))
def cluster(): try: cluster = ClickHouseCluster(__file__) cluster.add_instance( "node1", main_configs=["configs/config.d/storage_conf.xml"], macros={"replica": "node1"}, with_zookeeper=True, with_hdfs=True, ) cluster.add_instance( "node2", main_configs=["configs/config.d/storage_conf.xml"], macros={"replica": "node2"}, with_zookeeper=True, with_hdfs=True, ) logging.info("Starting cluster...") cluster.start() if cluster.instances["node1"].is_debug_build(): # https://github.com/ClickHouse/ClickHouse/issues/27814 pytest.skip( "libhdfs3 calls rand function which does not pass harmful check in debug build" ) logging.info("Cluster started") fs = HdfsClient(hosts=cluster.hdfs_ip) fs.mkdirs("/clickhouse1") fs.mkdirs("/clickhouse2") logging.info("Created HDFS directory") yield cluster finally: cluster.shutdown()
def ProcAll(LocalDir, HdfsDir):
    NameNode = GolobalConfig['hdfs']['NameNode']
    UserName = GolobalConfig['hdfs']['UserName']
    client = HdfsClient(hosts=NameNode, user_name=UserName)
    if not client.exists(HdfsDir):
        client.mkdirs(HdfsDir)
    total = len(os.listdir(LocalDir))
    processed = 0
    failedList = list()
    FileSize = 0
    StartTime = time.time()
    for filename in os.listdir(LocalDir):
        srcFile = os.path.join(LocalDir, filename)
        dstFile = HdfsDir + '/' + filename
        if not ProcOne(client, srcFile, dstFile):
            failedList.append(srcFile)
        else:
            FileSize += os.path.getsize(srcFile)
        processed += 1
        print('%d/%d/%d, time cost: %.2f s' % (total, processed, len(failedList), time.time() - StartTime))
        print('%d B, %.2f MB/s \n' % (FileSize, FileSize / 1024 / 1024 / (time.time() - StartTime)))
    if failedList:
        print('failedList: %s' % repr(failedList))
        return False
    else:
        print('Good! No Error!')
        print('%d B, %.2f MB, %.2f GB, %.2f MB/s' %
              (FileSize, FileSize / 1024 / 1024, FileSize / 1024 / 1024 / 1024,
               FileSize / 1024 / 1024 / (time.time() - StartTime)))
        return True
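# Hypothetical sketch of the ProcOne helper referenced above (it is not shown in this
# section): presumably it uploads one local file to HDFS and reports success or failure.
# The name, the delete-then-copy strategy, and the error handling are assumptions;
# exists/delete/copy_from_local are standard pyhdfs.HdfsClient calls.
def ProcOne(client, srcFile, dstFile):
    try:
        if client.exists(dstFile):
            client.delete(dstFile)
        client.copy_from_local(srcFile, dstFile)
        return True
    except Exception as e:
        print('upload failed for %s: %s' % (srcFile, e))
        return False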
def cluster(): try: cluster = ClickHouseCluster(__file__) cluster.add_instance("node", main_configs=["configs/config.d/storage_conf.xml"], with_hdfs=True) logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") fs = HdfsClient(hosts=cluster.hdfs_ip) fs.mkdirs('/clickhouse') logging.info("Created HDFS directory") yield cluster finally: cluster.shutdown()
def test_read_files_with_spaces(started_cluster):
    hdfs_api = started_cluster.hdfs_api

    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_spaces'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)

    hdfs_api.write_data(f"{dir}/test test test 1.txt", "1\n")
    hdfs_api.write_data(f"{dir}/test test test 2.txt", "2\n")
    hdfs_api.write_data(f"{dir}/test test test 3.txt", "3\n")

    node1.query(f"create table test (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{dir}/test*', 'TSV')")
    assert node1.query("select * from test order by id") == "1\n2\n3\n"

    fs.delete(dir, recursive=True)
def test_hdfsCluster(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_hdfsCluster'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)
    hdfs_api.write_data("/test_hdfsCluster/file1", "1\n")
    hdfs_api.write_data("/test_hdfsCluster/file2", "2\n")
    hdfs_api.write_data("/test_hdfsCluster/file3", "3\n")

    actual = node1.query(
        "select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    actual = node1.query(
        "select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    fs.delete(dir, recursive=True)
def __push_preprocessed(self, c3_path: str, user_name: str, dataset: Dataset):
    def push_to_hdfs(jstrs):
        if not fs.exists(c3_path):
            fs.create(c3_path, '\n'.join(jstrs) + '\n')
        else:
            fs.append(c3_path, '\n'.join(jstrs) + '\n')

    fs = HdfsClient(self.C3_HDFS_HOST, user_name=user_name)
    fs.mkdirs('/'.join(c3_path.split('/')[:-1]))
    fs.delete(c3_path)

    jstrs = []
    BUFSIZE = 2048
    for fxed_instance in tqdm(Iterator(dataset, batch_size=1), maxinterval=len(dataset.examples)):
        fxed_instance_dict = {name: getattr(fxed_instance, name).tolist()[0] for name in self.fields.keys()}
        jstrs.append(json.dumps(fxed_instance_dict))
        if len(jstrs) >= BUFSIZE:
            push_to_hdfs(jstrs)
            jstrs = []
    if jstrs:
        push_to_hdfs(jstrs)
def start():
    # Connect to MongoDB and query tokens; the latest data is then fetched from etherscan by contractAddress
    client = MongoCluster().connect()
    db = client.get_database('gse-transaction')
    collection = db.get_collection('mrout_6000001-6001000')
    # collection.insert_one()

    # Connect to HDFS and read files
    from pyhdfs import HdfsClient
    client2 = HdfsClient(hosts='%s:50070' % hdfs_ip, max_tries=10)
    # Return this user's home directory
    print(client2.get_home_directory())
    # Return the active namenode
    print(client2.get_active_namenode())
    # List all files under the given directory
    print(client2.listdir("/user/leon/mrout_3_6000001-6001000/"))
    # Read a single file
    client2.mkdirs("/user/leon")
    inputfile = client2.open('/user/leon/mrout_3_6000001-6001000/part-00000')
    # Inspect the file contents
    for r in inputfile:
        line = r.decode('utf-8')  # open() yields bytes; decode each line to a UTF-8 string
        print(line)
def _setup_walk(self, client: HdfsClient) -> Callable[..., str]:
    def path(*args: str) -> str:
        return posixpath.join(TEST_DIR, *args)

    self._make_empty_dir(client)
    client.create(path("f1"), b"")
    client.mkdirs(path("a1", "b1"))
    client.create(path("a1", "b1", "f2"), b"")
    client.mkdirs(path("a1", "b2"))
    client.mkdirs(path("a2"))
    return path
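# Minimal usage sketch (not part of the original tests) for the tree _setup_walk builds:
# pyhdfs.HdfsClient.walk mirrors os.walk and yields (dirpath, dirnames, filenames) tuples,
# so the layout above should come back roughly in the shape shown in the comments.
for dirpath, dirnames, filenames in client.walk(TEST_DIR):
    print(dirpath, sorted(dirnames), sorted(filenames))
# Expected shape for the tree created above:
#   <TEST_DIR>        ['a1', 'a2']  ['f1']
#   <TEST_DIR>/a1     ['b1', 'b2']  []
#   <TEST_DIR>/a1/b1  []            ['f2']
#   <TEST_DIR>/a1/b2  []            []
#   <TEST_DIR>/a2     []            []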
def _make_empty_dir(self, client: HdfsClient) -> None:
    # Get an empty dir
    client.delete(TEST_DIR, recursive=True)
    assert not client.delete(TEST_DIR, recursive=True)
    assert client.mkdirs(TEST_DIR)
        if x['type'] == 'DIRECTORY'
    ]
    list_dirs.sort()
    for subdir in list_dirs:
        dir_subdata = os.path.join(dir_dataroot, subdir)
        logger.debug('data path : %s' % dir_subdata)
        dir_subdata_cleaned = os.path.join(dir_subdata, 'cleaned4netsec')
        logger.debug('data path for cleaned files : %s' % dir_subdata_cleaned)
        list_subdir_date = [
            x['pathSuffix'] for x in client.list_status(dir_subdata)
            if x['type'] == 'FILE'
        ]
        if len(list_subdir_date) > 0:
            if not client.exists(dir_subdata_cleaned):
                client.mkdirs(dir_subdata_cleaned)
                logger.debug('mkdir dir for cleaned files : %s' % dir_subdata_cleaned)
            list_subdir_date_cleaned = [
                x['pathSuffix'] for x in client.list_status(dir_subdata_cleaned)
                if x['type'] == 'FILE'
            ]
            list_subdir_date.sort()
            for fname in list_subdir_date:
                if fname in list_subdir_date_cleaned:
                    # #TODO: to debug
                    # if client.exists(os.path.join(dir_subdata_cleaned, fname)):
                    #     print(os.path.join(dir_subdata_cleaned, fname))
class hdfs(object):
    # Default port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost=self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        # Delegate to the underlying client (the original called self.concat, which recurses forever)
        self.hdfs.concat(target, sources)

    # self, taskJobId,tableName=None,jobTemplateFieldList=None
    def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None, data=None):
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename == None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get("task_job_id_sequence") if paramMap != None else None
        if task_job_id_sequenceValue != None:
            column_dict.update({"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            # Pass column_dict as the file payload; positionally it would land in
            # jobTemplateFieldList and the table would be created empty.
            self.createTableByTaskJobId(jobid, tablename, data=column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data=data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
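# Minimal usage sketch for the wrapper above; the host, base path, and table name are
# hypothetical, and only the 'url'/'dbname' config keys are taken from __init__.
cur_database_param = {'url': 'namenode-host:50070', 'dbname': '/user/crawler'}
store = hdfs(cur_database_param)
store.save_to_hdfs2('/user/crawler/demo_table', '{"id": 1}\n')
print(store.listdir('/user/crawler'))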