from pyhdfs import HdfsClient


class HDFSService:
    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunk_size = 100000

    def __init__(self):
        self._client = HdfsClient(hosts=self.namenode_host + ":" + self.namenode_port,
                                  user_name="root")

    def get(self, hdfs_path: str):
        # Stream the file back in chunk_size blocks.
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunk_size):
            # length is the number of bytes to read from the offset, not an end offset
            file_response = self._client.open(hdfs_path, offset=i, length=self.chunk_size)
            yield file_response.read()

    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)

    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        return int(self.get_file_size(hdfs_path) / self.chunk_size + 1)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
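
# A minimal usage sketch of HDFSService above, assuming the NameNode at localhost:9870
# is reachable; the /data/messages.log path is a made-up example.
service = HDFSService()
service.append("/data/messages.log", b"first message\n")
service.append("/data/messages.log", b"second message\n")
for chunk in service.get("/data/messages.log"):
    print(chunk.decode("utf-8"), end="")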
def read_hdfs(filename, host, split_ratio, delimiter=',', normalize=False, dtype=None,
              header=None, skiprows=None, index_col=False, output_label=True,
              randomize=False, return_as_dataframe=False, describe=False,
              label_vector=False):
    client = HdfsClient(hosts=host)
    return read_csv(filename=client.open(filename), split_ratio=split_ratio,
                    delimiter=delimiter, normalize=normalize, dtype=dtype, header=header,
                    skiprows=skiprows, index_col=index_col, output_label=output_label,
                    randomize=randomize, return_as_dataframe=return_as_dataframe,
                    describe=describe, label_vector=label_vector)
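
# A hedged usage sketch of read_hdfs; the host and file path are placeholders, and
# read_csv is assumed to be the project's own CSV loader, so the shape of the return
# value depends on the flags it is given.
splits = read_hdfs(filename='/datasets/iris.csv', host='namenode:9870',
                   split_ratio=0.8, header=0, normalize=True)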
def __load_corpus_from_hdfs(self, hdfs_host: str) -> List:
    fs = HdfsClient(hdfs_host)
    with fs.open(self.corpus_path) as fp:
        corpus = list()
        for line in tqdm(fp.read().decode().split('\n')):
            if line:
                d = json.loads(line)
                corpus.append(d)
    return corpus
def load_fields_with_vocab(self, hdfs_host: str) -> Dict[str, Field]:
    fs = HdfsClient(hdfs_host)
    if fs.exists(self.fields_path):
        print(f'get fields from {hdfs_host}{self.fields_path}')
    else:
        raise Exception(f'there are no fields in {hdfs_host}{self.fields_path}')
    loaded_dict = json.loads(fs.open(self.fields_path).read())
    return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}
def load_model(self, train_dir, modelnum, appendix):
    print('~' * 100)
    c3_path = f'/user/{self.username}/fortuna/model/{train_dir}_{modelnum}/model_e{appendix}'
    print(c3_path)
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
    model_pickle = fs.open(c3_path)
    model_dict = pickle.load(model_pickle)
    self.model.load_state_dict(model_dict)
    acc_lst, total, prec, recall, f1score, f1s, rocauc = self.eval(
        self.test_iter, len(self.task.te_dataset))
    print('~' * 100)
def load_fields_from_c3(self) -> Tuple[Dict[str, Field], Dict[str, int]]:
    # Returns both the fields and the max vocab index per field
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    if fs.exists(self.c3_fields_path):
        print(f'get fields from {self.C3_HDFS_HOST}{self.c3_fields_path}')
    else:
        raise Exception(f'there are no fields in {self.C3_HDFS_HOST}{self.c3_fields_path}')
    loaded_dict = json.loads(fs.open(self.c3_fields_path).read())
    print(loaded_dict)
    max_vocab_indexes = {k: v['max_vocab_index'] for k, v in loaded_dict.items()}
    return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}, max_vocab_indexes
def load_matrix(self, filepath, shape=None):
    if os.environ['local'] == '1' and os.path.exists(filepath):
        return np.loadtxt(filepath, dtype=float)
    else:
        hosts = os.environ['hosts']
        if len(hosts) == 0:
            hosts = 'master'
        client = HdfsClient(hosts=hosts)
        if client.exists(filepath):
            # np.frombuffer replaces the deprecated binary np.fromstring; the stored
            # bytes are interpreted as a flat float64 array and reshaped.
            return np.frombuffer(client.open(filepath).read()).reshape(shape)
        return False
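
# For reference, a small local round trip showing why load_matrix needs an explicit
# shape: frombuffer returns a flat array, so the original dimensions must be supplied
# separately. The array below is made up for illustration.
import numpy as np

m = np.arange(6, dtype=np.float64).reshape(2, 3)
raw = m.tobytes()                                    # what would be stored in HDFS
restored = np.frombuffer(raw, dtype=np.float64).reshape(2, 3)
assert (restored == m).all()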
def _load_preprocessed(self) -> List[Example]:
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    if fs.exists(self.c3_path):
        print(f'get preprocessed corpus from {self.C3_HDFS_HOST}{self.c3_path}')
    else:
        raise Exception(f'there is no preprocessed corpus in {self.C3_HDFS_HOST}{self.c3_path}')
    preprocessed = []
    for line in fs.open(self.c3_path).read().decode().split('\n'):
        if line:
            ex = Example()
            for k, v in json.loads(line).items():
                setattr(ex, k, v)
            preprocessed.append(ex)
    return preprocessed
def read_csv():
    client = HdfsClient(hosts='master33:50070', user_name='hadoop')
    inputfile = client.open('/pdfs/output.csv')
    df = pd.read_csv(inputfile)
    print("read done")

    # Convert the "vector" string column into dense vectors
    def transfer_vectorStr_to_vector(df):
        for i in range(0, len(df['vector'])):
            array = np.fromstring(
                df['vector'][i].replace('[', '').replace(']', ''),
                dtype=np.double, sep=' ')
            df.at[i, 'vector'] = Vectors.dense(array)
        return df

    df = transfer_vectorStr_to_vector(df)
    return df
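
# A small illustration of the string-to-vector conversion above, using a made-up value
# in the same "[v1 v2 ...]" format; Vectors is assumed here to be pyspark.ml.linalg.Vectors.
from pyspark.ml.linalg import Vectors
import numpy as np

s = "[0.1 0.2 0.3]"
arr = np.fromstring(s.replace('[', '').replace(']', ''), dtype=np.double, sep=' ')
v = Vectors.dense(arr)  # DenseVector([0.1, 0.2, 0.3])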
def start():
    # Connect to MongoDB, look up tokens, then query etherscan for the latest data by contractAddress
    client = MongoCluster().connect()
    db = client.get_database('gse-transaction')
    collection = db.get_collection('mrout_6000001-6001000')
    # collection.insert_one()

    # Connect to HDFS and read files
    from pyhdfs import HdfsClient
    client2 = HdfsClient(hosts='%s:50070' % hdfs_ip, max_tries=10)
    # Home directory of this user
    print(client2.get_home_directory())
    # Currently active namenode
    print(client2.get_active_namenode())
    # All files under the given directory
    print(client2.listdir("/user/leon/mrout_3_6000001-6001000/"))
    # Read a single file
    client2.mkdirs("/user/leon")
    inputfile = client2.open('/user/leon/mrout_3_6000001-6001000/part-00000')
    # Print the file contents
    for r in inputfile:
        line = r.decode('utf-8')  # open() yields bytes; decode to str
        print(line)
list_subdir_date.sort()
for fname in list_subdir_date:
    if fname in list_subdir_date_cleaned:
        # #TODO: to debug
        # if client.exists(os.path.join(dir_subdata_cleaned, fname)):
        #     print(os.path.join(dir_subdata_cleaned, fname))
        #     client.delete(os.path.join(dir_subdata_cleaned, fname))
        logger.debug('has been cleaned, ignore this file : %s' % fname)
        continue
    s_guapairiqi = public.parse_datetime(
        fname, format_from='%Y-%m-%d-%H-%M.txt')
    f_fullname = os.path.join(dir_subdata, fname)
    logger.debug('doing file : %s' % f_fullname)
    f = client.open(f_fullname)
    try:
        f_context = f.read().decode('gbk')
    except UnicodeDecodeError as e:
        logger.error('decode error : %s' % f_fullname)
        logger.error(e)
        dir_error = os.path.join(dir_subdata, 'error_cleaning')
        if not client.exists(dir_error):
            client.mkdirs(dir_error)
            logger.debug('mkdir dir for error files : %s' % dir_error)
        # TODO: if success delete error files
        fname_error = os.path.join(dir_error, fname)
        if not client.exists(fname_error):
            client.create(fname_error, b'')
            logger.warning('create error flag file : %s' % fname_error)
import pandas as pd
import numpy as np
import h5py
from hdfs.client import Client
from pyhdfs import HdfsClient

'''
import pyarrow as pa
pa.hdfs.connect(host='192.168.0.186', port=9870, user='******')
# pa.hdfs.connect()
'''

client = HdfsClient(hosts='192.168.0.186:9870', user_name='yanrujing')
a1 = client.open('/r2/test/transformed.h5')
# a1 = client.open('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv')
b1 = a1.read()
c1 = h5py.File('/home/chen/桌面/transformed.h5')
# http://192.168.0.186:9870/explorer.html#/r2/test/transformed.h5

'''
client2 = Client(url="http://192.168.0.186:9870", root='yanrujing')
# client2.read('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv')
# a2 = client2.read('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv')
with client2.read('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv') as reader:
    # a = pd.read_csv(reader)
    content = reader.read()
'''

a1 = open('/home/chen/桌面/transformed.h5', 'rb')
b1 = h5py.File(a1, 'r')
# Open a file stored on HDFS
from pyhdfs import HdfsClient

client = HdfsClient(hosts='ghym:50070', user_name='hadoop')
inputfile = client.open('/score.txt')

# Convert it to CSV format
import pandas as pd

df = pd.read_table(inputfile, encoding='gbk', sep=',')
df.to_csv('demo.csv', encoding='gbk', index=None, columns=['用户名', '电影名', '评分'])
from pyhdfs import HdfsClient
import pandas as pd

client = HdfsClient(hosts="172.16.18.114:50070,172.16.18.112:50070", user_name='hadoop')
# pyhdfs expects an absolute HDFS path, not a full hdfs://host:port URL
path = '/a3bd481c98f44dde842367b7ebaef4a6/dataSet/vbap8482a43e3883413f8345344efa6b53ec/data/Data.csv'
head = client.open(path)
data = pd.read_csv(head)
print(data)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyhdfs import HdfsClient
'''
Connect to Hadoop's HDFS file system from Python to upload and download files
'''

# Read a file from the HDFS file system
# HDFS address
# client = HdfsClient(hosts='192.168.1.163:50070')
client = HdfsClient(hosts='192.168.1.156:50070')
print(client.listdir("/repo/"))
res = client.open('/repo/README.txt')
for r in res:
    line = str(r, encoding='utf-8')  # open() yields bytes; convert and decode to str
    print(line)

client = HdfsClient(hosts='192.168.1.156:50070', user_name='hadoop')  # only the hadoop user has write permission
str1 = 'hello world'
client.create('/py.txt', str1)  # create a new file and write the string

# Upload a local file to HDFS
# client = HdfsClient(hosts='hacker:50070', user_name='root')
# Absolute path of the local file; the HDFS target directory must not already exist
# client.copy_from_local('D:/PythonProjects/crawl_work/thread_crawl_work02', '/usr/hadoop/')
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
"""
Read file contents from HDFS
@see: https://pypi.org/project/PyHDFS/
"""
__author__ = 'JKong'

import pandas as pd
from pyhdfs import HdfsClient

client = HdfsClient(hosts='10.10.27.47:9870', user_name="hdfs")
# TypeError: cannot use a string pattern on a bytes-like object

# Read the file from HDFS
file = client.open(r"/a_jkong_test_data/1000.txt")
# Get the contents
content = file.read()
# open() returns bytes; str() converts and decodes to a string
s = str(content, "utf-8")

# Open the local .csv file and write the contents
with open("./data/data.csv", "w") as out:
    out.write(s)

# Read the local csv file with pandas
train_data = pd.read_csv("./data/data.csv", sep=",", header=None, usecols=[0, 1, 2, 3, 4],
                         names=['id', 'name', 'age', 'gender', 'time'])
print(train_data)
class hdfs(object):
    # Default port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost = self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        # Delegate to the underlying client (calling self.concat here would recurse forever)
        self.hdfs.concat(target, sources)

    # self, taskJobId, tableName=None, jobTemplateFieldList=None
    def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None, data=None):
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename == None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get(
            "task_job_id_sequence") if paramMap != None else None
        if task_job_id_sequenceValue != None:
            column_dict.update(
                {"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            # pass the payload as data, not as jobTemplateFieldList
            self.createTableByTaskJobId(jobid, tablename, data=column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data=data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
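
# A hedged usage sketch of the hdfs wrapper above; the 'url'/'dbname' keys follow the
# constructor, while the host, base path, table name and payload bytes are invented.
cur_database_param = {'url': 'namenode:50070', 'dbname': '/user/crawler'}
store = hdfs(cur_database_param)
store.save_to_hdfs2('/user/crawler/demo_table', b'col1,col2\n')  # creates the file
store.save_to_hdfs2('/user/crawler/demo_table', b'1,2\n')        # appends on later calls
print(store.listdir('/user/crawler'))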
for day in days_path:
    hours = client.listdir(day)
    for hour in hours:
        path = day + '/' + str(hour)
        # print(path)
        hours_path.append(path)

minutes_path = []
for path in hours_path:
    minutes = client.listdir(path)
    if len(minutes) != 60:
        print('[ INFO ] Incomplete minutes (less than 60) in path: ' + str(path) + ' (Count = ' + str(len(minutes)) + ')')
    for minute in minutes:
        path_out = path + '/' + str(minute)
        # print(path_out)
        minutes_path.append(path_out)

counter = 0
len_contents_list = []
for path in minutes_path:
    # open() returns bytes, so decode before applying the str regex
    len_contents = len(re.sub(r'\n$', '', client.open(path).read().decode()).split('\n'))
    len_contents_list.append(len_contents)
    counter += len_contents

print('\n\nTotal number of logs: ' + str(counter) + '\n\n')
# ZEND