from hdfs.client import Client


def put_to_hdfs(result_file):
    """Upload result_file to /tmp on HDFS, replacing any previous result."""
    client = Client("http://192.168.53.30:50070")
    # If an old result is already there, remove it before uploading.
    if client.status('/tmp/result.csv', strict=False):
        client.delete('/tmp/result.csv')
    client.upload('/tmp', result_file)
from hdfs.client import Client

# Key and values of the WebHDFS FileStatus dict returned by Client.status().
TYPE = 'type'
FILE = 'FILE'
DIRECTORY = 'DIRECTORY'
# Chunk size used when streaming file contents (assumed value; the original constant is defined elsewhere).
FILE_SIZE = 64 * 1024


class HdfsClient(object):
    def __init__(self, url=None):
        self.url = url
        self.client = Client(url=url)

    def ls(self, path):
        return self.client.list(path)

    def isFile(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == FILE
        return False

    def mkdir(self, path):
        self.client.makedirs(path, permission=777)

    def isDirectory(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == DIRECTORY
        return False

    def upload(self, localSourcePath, remoteDistPath):
        self.client.upload(remoteDistPath, localSourcePath, overwrite=True)

    def download(self, remoteSourcePath, localDistPath):
        self.client.download(remoteSourcePath, localDistPath, overwrite=True)

    def put(self, localSourcePath, remoteDistPath):
        # Stream a local file to HDFS in FILE_SIZE-sized chunks.
        with open(localSourcePath, "r") as reader, self.client.write(remoteDistPath) as writer:
            data = reader.read(FILE_SIZE)
            while data != "":
                writer.write(data)
                data = reader.read(FILE_SIZE)

    def get(self, remoteSourcePath, localDistPath):
        # Stream an HDFS file down to a local file in FILE_SIZE-sized chunks.
        with self.client.read(remoteSourcePath, chunk_size=FILE_SIZE) as reader, \
                open(localDistPath, "a+") as writer:
            for chunk in reader:
                writer.write(chunk)
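# A minimal usage sketch for the HdfsClient wrapper above. The WebHDFS endpoint
# and the local/remote paths here are illustrative assumptions, not values from
# the original code.
if __name__ == '__main__':
    hdfs = HdfsClient(url='http://localhost:50070')

    if not hdfs.isDirectory('/tmp/demo'):
        hdfs.mkdir('/tmp/demo')

    # Chunked copy up to HDFS and back down again.
    hdfs.put('local_input.txt', '/tmp/demo/input.txt')
    hdfs.get('/tmp/demo/input.txt', 'local_copy.txt')

    print(hdfs.ls('/tmp/demo'))
    print(hdfs.isFile('/tmp/demo/input.txt'))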
from hdfs.client import Client


def upload_data(hdfs_address, previous_task_id, **kwargs):
    '''
    Description:
        Upload a file from the local filesystem to HDFS.
    Parameters:
        - hdfs_address: Hadoop master node IP address (WebHDFS endpoint)
        - previous_task_id: used for Airflow XCom; it tells this function which
          file produced by the previous task should be loaded into HDFS.
    Returns:
        None
    '''
    # Retrieve the file name from the previous task through XCom.
    ti = kwargs['ti']
    file_name = ti.xcom_pull(task_ids=previous_task_id)

    local_file_path = '/home/ubuntu/airflow/dags/data_from_API/' + file_name
    hdfs_file_path = '/data/' + file_name

    # Connect to HDFS.
    client = Client('http://' + hdfs_address)
    # Upload the file.
    client.upload(hdfs_file_path, local_file_path)
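# A hedged sketch of how upload_data might be wired into a DAG using the classic
# Airflow 1.x PythonOperator API. The DAG id, schedule, hdfs_address value and
# the upstream fetch_data callable (assumed to return the file name via XCom)
# are illustration-only assumptions, not part of the original code.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG('api_to_hdfs', start_date=datetime(2021, 1, 1), schedule_interval='@daily') as dag:
    fetch_task = PythonOperator(
        task_id='fetch_data',
        python_callable=fetch_data,   # assumed upstream task producing the file name
        provide_context=True,
    )
    upload_task = PythonOperator(
        task_id='upload_data',
        python_callable=upload_data,
        op_kwargs={'hdfs_address': '192.168.53.30:50070',  # assumed namenode host:port
                   'previous_task_id': 'fetch_data'},
        provide_context=True,         # passes 'ti' into **kwargs so xcom_pull works
    )
    fetch_task >> upload_task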
' a study.__init__.py module '

__author__ = 'steven'

import os
import time

from hdfs.client import Client

client = Client("http://127.0.0.1:50070", root="/", timeout=100)

print(client.makedirs("/test/"))
print(client.status("/test/"))
print(client.list("/test/"))
print(client.delete("/test/", True))

# Upload test.pdf under a millisecond-timestamped name.
upload_filename = client.upload(
    "/test/" + str(int(round(time.time() * 1000))) + ".pdf", "test.pdf")
print(upload_filename)

download_path = os.path.join(os.path.abspath('.'), 'download/hdfs/')
if not os.path.exists(download_path):
    os.makedirs(download_path, exist_ok=True)
else:
    print(download_path, 'already exists.')

print(
    client.download(
        upload_filename,
        download_path + str(int(round(time.time() * 1000))) + ".pdf"))

# The second delete returns False because the file is already gone.
print(client.delete(upload_filename))
print(client.delete(upload_filename))
def xpath_config_file():
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str
        start_cursor = 0
        is_finish = True
        starttime = time.clock()
        host_set = set()
        while is_finish:
            values = set()
            # Scan the Redis keyspace for per-host totals, 10 keys at a time.
            limit = r.scan(start_cursor, 'total:*', 10)
            if limit[0] == 0:
                is_finish = False
            start_cursor = limit[0]
            for h in limit[1]:
                host = h.split(":")[1]
                total_key = h
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host
                total = r.get(total_key)
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                txpath_num_1 = None
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    if len(txpath) == 2:
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0
                    # Keep only the dominant xpath when it covers at least 80% of
                    # the pages; otherwise keep both candidates seen at least once.
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                        host_set.add(host)
                # Merge in the manually maintained rules from MySQL.
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # Upload the file to the xpath configuration directory on HDFS.
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total hosts %s, action time %ss' % (len(host_set), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
file_object.writelines("\n".join(province_city_mapping_data))
file_object.close()
print("========== Finished crawling COVID-19 data for Chinese cities ==========")

print("========== Started crawling COVID-19 patient track data ==========")
track_list_data = get_patient_track_data()
track_json_str_list = []
for i in range(len(track_list_data)):
    track_json_str_list.append(json.dumps(track_list_data[i], sort_keys=True, ensure_ascii=False))

# Write out the patient track data.
file_object = open(output_path + 'feiyan_track_info.txt', 'w')
file_object.writelines("\n".join(track_json_str_list))
file_object.close()
print("========== Finished crawling COVID-19 patient track data ==========")

print("========== Started uploading crawled data to HDFS ==========")
# Check whether the target directory exists.
hdfs_path_status = hdfs_client.status(hdfs_path=hdfs_path, strict=False)
if hdfs_path_status is None:
    # Create the directory if it does not exist.
    hdfs_client.makedirs(hdfs_path=hdfs_path, permission="755")
else:
    # Delete it if it already exists.
    hdfs_client.delete(hdfs_path, recursive=True)
# On success, upload() returns hdfs_path.
hdfs_client.upload(hdfs_path=hdfs_path, local_path=output_path,
                   chunk_size=2 << 25, progress=callback, cleanup=True)
print("========== Finished uploading crawled data to HDFS ==========")
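# The `callback` passed as progress= above is not shown. A possible implementation,
# based on the hdfs library's contract of invoking the hook with the file path and
# the number of bytes transferred so far, and once more with -1 when a file completes:
def callback(file_path, bytes_transferred):
    if bytes_transferred == -1:
        print("finished uploading %s" % file_path)
    else:
        print("%s: %d bytes transferred" % (file_path, bytes_transferred))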
from hdfs import Client, InsecureClient
from hdfs.util import HdfsError


class HDFSClient:
    def __init__(self, url, root=None, user=None, proxy=None, timeout=None, session=None):
        """
        Connect to HDFS.
        url: hostname or IP address and port of the HDFS namenode
        root: root path, used as a prefix for all HDFS paths passed to the client
        user: when set, use InsecureClient (a subclass of the base Client) and access
              HDFS as this user; the plain Client uses the default user dr.who
        proxy: user to proxy as
        timeout: connection timeout, forwarded to the request handler
        session: requests.Session instance used to issue all requests
        """
        if user:
            self.client = InsecureClient(url, user=user)
        else:
            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=session)

    def list_hdfs_file(self, hdfs_path, status=False):
        """
        List the contents of a directory.
        status: also return the FileStatus attributes of each file or directory
        return: a list; with status=True each element is a tuple of the directory
                or file name and its attributes
        """
        return self.client.list(hdfs_path, status=status)

    def walk_hdfs_file(self, hdfs_path, depth=0, status=False, ignore_missing=False,
                       allow_dir_changes=False):
        """
        Depth-first walk of the remote filesystem.
        hdfs_path: starting path. If it does not exist, an HdfsError is raised;
                   if it points to a file, the returned generator is empty
        depth: maximum depth to explore; 0 means no limit
        status: also return the corresponding FileStatus of each file or folder
        ignore_missing: ignore missing nested folders instead of raising an exception
        allow_dir_changes: allow changes to the directory listings to affect the walk
        return: a generator, analogous to Python's os.walk
        """
        return self.client.walk(hdfs_path, depth=depth, status=status,
                                ignore_missing=ignore_missing,
                                allow_dir_changes=allow_dir_changes)

    def delete_hdfs_file(self, hdfs_path, recursive=False, skip_trash=False):
        """
        Delete a file or directory.
        recursive: delete recursively; by default this method raises an HdfsError
                   when trying to delete a non-empty directory
        skip_trash: when False, deleted paths are moved to the appropriate trash
                    folder instead of being removed
        return: True if the deletion succeeded, False if no file or directory
                previously existed at hdfs_path
        """
        return self.client.delete(hdfs_path, recursive=recursive, skip_trash=skip_trash)

    def download_hdfs_file(self, hdfs_path, local_path, overwrite=True, n_threads=1,
                           temp_dir=None, **kwargs):
        """
        Download a file.
        hdfs_path: path of the file or folder to download from HDFS; for a folder,
                   all files under it are downloaded
        local_path: local path; if it already exists and is a directory, the files
                    are downloaded into it
        overwrite: overwrite any existing file or directory
        n_threads: number of threads used for parallelization; 0 (or a negative
                   value) uses as many threads as there are files
        temp_dir: directory under which the files are first downloaded when
                  overwrite=True and the final destination path already exists;
                  it is swapped in once the download completes successfully
        **kwargs: keyword arguments forwarded to read(); if no chunk_size is
                  passed, a default of 64 kB is used
        return: the local download path on success
        """
        return self.client.download(hdfs_path, local_path, overwrite=overwrite,
                                    n_threads=n_threads, temp_dir=temp_dir, **kwargs)

    def upload_hdfs_file(self, hdfs_path, local_path, n_threads=1, temp_dir=None,
                         chunk_size=65536, progress=None, cleanup=True, **kwargs):
        """
        Upload a file.
        hdfs_path: target HDFS path; if it already exists and is a directory,
                   the files are uploaded into it
        local_path: local path of a file or folder; for a folder, all files inside
                    it are uploaded (note that this means folders without files
                    will not be created remotely)
        cleanup: delete any uploaded files if an error occurs during the upload
        return: a status code, the remote upload path, and an error message
        """
        try:
            res = self.client.upload(hdfs_path, local_path, n_threads=n_threads,
                                     temp_dir=temp_dir, chunk_size=chunk_size,
                                     progress=progress, cleanup=cleanup, overwrite=True)
            return 0, res, ''
        except HdfsError as e:
            return 1, '', str(e)

    def makedirs(self, hdfs_path, permission=None):
        """
        Create directories, recursively if needed.
        permission: octal permission to set on the newly created directories;
                    it is only applied to directories that do not yet exist
        return: None
        """
        self.client.makedirs(hdfs_path, permission=permission)

    def parts(self, hdfs_path, parts=None, status=False):
        """
        hdfs_path: remote path. The directory should contain at most one part file
                   per partition (otherwise one is chosen arbitrarily)
        parts: list of part-file numbers, or the total number of part files to
               select. If a number, that many partitions are picked at random;
               by default all part files are returned. If parts is a list and one
               of the parts is not found, or too many samples are requested, an
               HdfsError is raised
        status: also return the FileStatus of each file
        return: a dictionary of the part files corresponding to the path
        """
        return self.client.parts(hdfs_path, parts=parts, status=status)

    def read_hdfs_file(self, **kwds):
        """
        Read the contents of a file. This method must be used inside a with block
        so that the connection is closed afterwards.
        >>> with client.read('foo') as reader:
        >>>     content = reader.read()
        hdfs_path: HDFS path
        offset: starting byte position
        length: number of bytes to process; None reads the whole file
        buffer_size: buffer size used to transfer data, in bytes; defaults to the
                     value set in the HDFS configuration
        encoding: encoding used to decode the response; by default raw data is returned
        chunk_size: if set to a positive number, the context manager returns a
                    generator yielding chunk_size bytes at a time instead of a
                    file-like object (unless a delimiter is also set)
        delimiter: if set, the context manager returns a generator yielding on
                   every occurrence of the delimiter; requires encoding to be set
        progress: callback used to track progress, called every chunk_size bytes
                  (not available if the chunk size is not specified). It is passed
                  two arguments, the path of the file being transferred and the
                  number of bytes transferred so far; on completion it is called
                  once more with -1 as the second argument
        """
        return self.client.read(**kwds)

    def write_hdfs_file(self, hdfs_path, data=None, overwrite=False, permission=None,
                        blocksize=None, replication=None, buffersize=None,
                        append=False, encoding=None):
        """
        Create a file on HDFS.
        data: contents to write. Can be a string, a generator or a file object;
              the last two options allow streaming uploads (i.e. without loading
              the whole contents into memory). If None, this method returns a
              file-like object and should be called inside a with block
              (see the examples below)
        permission: octal permission to set on the newly created file
        append: append to the file instead of creating a new one
        encoding: encoding used to serialize the data written

        >>> from json import dump, dumps
        >>> records = [
        >>>     {'name': 'foo', 'weight': 1},
        >>>     {'name': 'bar', 'weight': 2},
        >>> ]
        >>> # As a context manager:
        >>> with client.write('data/records.jsonl', encoding='utf-8') as writer:
        >>>     dump(records, writer)
        >>> # Or, passing in a generator directly:
        >>> client.write('data/records.jsonl', data=dumps(records), encoding='utf-8')
        """
        self.client.write(hdfs_path, data=data, overwrite=overwrite, permission=permission,
                          blocksize=blocksize, replication=replication,
                          buffersize=buffersize, append=append, encoding=encoding)

    def rename_or_move(self, hdfs_src_path, hdfs_dst_path):
        """
        Move a file or directory.
        hdfs_src_path: source path
        hdfs_dst_path: destination path. If the path already exists and is a
                       directory, the source is moved into it; if the path exists
                       and is a file, or if a parent destination directory is
                       missing, this method raises an HdfsError
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)

    def set_owner(self, hdfs_path, owner=None, group=None):
        """
        Change the owner of a file. At least one of owner and group must be given.
        owner: optional, new owner of the file
        group: optional, new owning group of the file
        """
        self.client.set_owner(hdfs_path, owner=owner, group=group)

    def set_permission(self, hdfs_path, permission):
        """
        Change file permissions.
        permission: new octal permission string for the file
        """
        self.client.set_permission(hdfs_path, permission)

    def set_replication(self, hdfs_path, replication):
        """
        Set the replication of a file.
        replication: number of replicas
        """
        self.client.set_replication(hdfs_path, replication)

    def set_times(self, hdfs_path, access_time=None, modification_time=None):
        """
        Change the timestamps of a file.
        """
        self.client.set_times(hdfs_path, access_time=access_time,
                              modification_time=modification_time)

    def status_hdfs_file(self, hdfs_path, strict=True):
        """
        Get the FileStatus of a file.
        strict: if False, return None instead of raising an exception when the
                path does not exist
        return: the FileStatus dictionary (or None)
        """
        return self.client.status(hdfs_path, strict=strict)
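# An illustrative use of the HDFSClient wrapper above; the endpoint, user and
# paths are assumptions, not values taken from the original code.
if __name__ == '__main__':
    hdfs = HDFSClient('http://namenode:50070', user='hadoop')

    hdfs.makedirs('/tmp/demo', permission='755')
    code, remote_path, err = hdfs.upload_hdfs_file('/tmp/demo', 'local_data.csv')
    if code != 0:
        raise RuntimeError('upload failed: %s' % err)

    print(hdfs.list_hdfs_file('/tmp/demo', status=True))
    print(hdfs.status_hdfs_file('/tmp/demo/local_data.csv', strict=False))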
table_names = []
for i in cursor_mysql.fetchall():
    table_names.append(i[0])

for table_name in table_names:
    os.system("mkdir -p /home/tmp/mysql_outfile/%s" % database)
    os.system("chown -R mysql:mysql /home/tmp/mysql_outfile/%s" % database)
    # os.system("rm -rf /home/tmp/mysql_outfile/%s" % (table_name))
    # Dump the table to a local delimited file.
    cursor_mysql.execute(
        '''select * from %s into outfile '/home/tmp/mysql_outfile/%s/%s'
           fields terminated by '`' lines terminated by '^' '''
        % (table_name, database, table_name))
    # hdfs_client.delete('/tmp/external_tables/%s/%s' % (database_name, table_name))
    # Create the target directory on HDFS and upload the dump.
    hdfs_client.makedirs('/tmp/external_tables/%s/%s/' % (external_database, table_name))
    hdfs_client.upload(
        '/tmp/external_tables/%s/%s/' % (external_database, table_name),
        '/home/tmp/mysql_outfile/%s/%s' % (database, table_name))

    # Build the external-table DDL from the MySQL column definitions.
    create_external_table = "create external table %s(" % (table_name)
    cursor_mysql.execute('desc %s' % (table_name))
    table_desces = cursor_mysql.fetchall()
    table_desc_num = len(table_desces)
    for table_desc in table_desces:
        column_name = table_desc[0]
        # Check 'bigint' and 'varchar' before their substrings 'int' and 'char'
        # so the longer type names are not shadowed.
        if 'bigint' in table_desc[1]:
            column_type = 'bigint'
        elif 'varchar' in table_desc[1]:
            column_type = 'string'
        elif 'char' in table_desc[1]:
            column_type = 'string'
        elif 'int' in table_desc[1]:
            column_type = 'int'
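# A hedged alternative to the if/elif chain above: the MySQL-to-external-table
# type mapping can be table driven, with longer type names listed first so that
# 'bigint' is not shadowed by the 'int' substring test. The names and the
# 'string' fallback below are illustrative assumptions.
MYSQL_TO_HIVE_TYPES = [
    ('bigint', 'bigint'),
    ('varchar', 'string'),
    ('char', 'string'),
    ('int', 'int'),
]


def map_column_type(mysql_type, default='string'):
    for key, mapped_type in MYSQL_TO_HIVE_TYPES:
        if key in mysql_type:
            return mapped_type
    return default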