def download(hdfs_path, local_path):
    # Initialize the HDFS connection
    client = Client('http://fisher.lazybone.xyz:9001', root='/')
    if os.path.exists(local_path):
        print('File already exists locally')
        return
    client.download(hdfs_path=hdfs_path, local_path=local_path)
def get_data():
    client = Client("http://t3.dev:50070", "hadoop")
    # client = InsecureClient(url="http://t3.dev:50070", user="******", root="/")
    print(client.list("/huiqu/common/area.txt"))
    with client.read("/huiqu/common/area.txt/part-00000") as read:
        # print(read.read().decode('utf8'))
        return {"data": read.read().decode('utf8')}
def save_to_storage(filename, sha1):
    if args.no_save:
        return False
    from hdfs import Client
    client = Client('http://{}:50070'.format(config.get("hdfs", "host")),
                    root=config.get("reportparser", "storage"))
    uploaded = client.upload(sha1, filename, overwrite=True)
    return uploaded
def main():
    client = Client("http://127.0.0.1:50070", root="/", timeout=100, session=False)
    # client.makedirs("/news")
    client.upload("/input", "x.html")
    print(client.list("/"))
def get(self, request):
    client = Client(HDFS_HOST)
    hdfs = request.GET.get('hdfs')
    file_name = DataSource.objects.get(format_filename=hdfs).file_name
    client.download('/datahoop/' + hdfs, settings.MEDIA_ROOT + 'hdfs_download')
    path = os.path.join(settings.MEDIA_ROOT, 'hdfs_download')
    file = open(os.path.join(path, hdfs), 'rb')
    # Stream the downloaded file back as a CSV attachment
    response = FileResponse(file, content_type='application/vnd.ms-csv')
    response['Content-Disposition'] = 'attachment; filename={}.csv'.format(file_name.split('.')[0])
    return response
class BiliaprioriPipeline(object):
    def __init__(self):
        self.client = Client("http://fantome:50070")

    def process_item(self, item, spider):
        print("Get Tag", " tag:", item['tagInfo'])
        # Keep appending to the same tag file
        self.client.write('/bili_3-7day/tagInfo.txt', item['tagInfo'] + "\n",
                          overwrite=False, append=True, encoding="utf-8")
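Appends like the one above usually require the target file to already exist on HDFS. Below is a minimal sketch of guarding the first write with a status check (the same pattern the HdfsPipeline snippet later in this collection uses); the host and path are reused from the snippet above, and the written tag value is a placeholder.

# Sketch only: create the file on first use, then append (host and path reused from above).
from hdfs import Client

client = Client("http://fantome:50070")
path = '/bili_3-7day/tagInfo.txt'
if client.status(path, strict=False) is None:       # file does not exist yet
    client.write(path, data='', encoding='utf-8')   # create it empty
client.write(path, data='some tag\n', append=True, encoding='utf-8')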
def upload(name, file_path, config):
    env_prefix = config.get("prefix", None)
    hdfs_client = Client(url=config["hdfs"]["name_node"])
    hdfs_hosts = []
    hdfs_http_host = config["hdfs"]["name_node"]
    hdfs_hosts.append(hdfs_http_host.replace("http://", ""))
    hdfs_data_service_root = "/data_service"
    if env_prefix is not None:
        hdfs_data_service_root = "/{0}_data_service".format(env_prefix)
    hdfs_client.makedirs(hdfs_data_service_root)
    timestamp = int(round(time.time() * 1000))
    target_file_name = "{2}/{0}/{1}/{0}_{1}.py".format(
        name, str(timestamp), hdfs_data_service_root)
    hdfs_client.makedirs("{2}/{0}/{1}".format(name, str(timestamp), hdfs_data_service_root))
    print("hdfs file name: {0}".format(target_file_name))
    hdfs_client.upload(target_file_name, file_path)
    zip_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "joowing.zip")
    target_zp_file_name = "{2}/{0}/{1}/joowing.zip".format(
        name, str(timestamp), hdfs_data_service_root)
    # hdfs_client.upload(target_zp_file_name, zip_path)
    # return target_file_name, target_zp_file_name
    return target_file_name
def __init__(self, **kwargs):
    self.table_cols_map = {}  # column order per table {table: (cols, col_default)}
    self.bizdate = bizdate  # business date: the date the spider was started
    self.buckets_map = {}  # buckets {table: items}
    self.bucketsize = kwargs.get('BUCKETSIZE')
    self.client = Client(kwargs.get('HDFS_URLS'))
    self.dir = kwargs.get('HDFS_FOLDER')  # folder path
    self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter, defaults to Hive's default delimiter
    self.encoding = kwargs.get('HDFS_ENCODING')  # file encoding, default 'utf-8'
    self.hive_host = kwargs.get('HIVE_HOST')
    self.hive_port = kwargs.get('HIVE_PORT')
    self.hive_dbname = kwargs.get('HIVE_DBNAME')  # database name
    self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE', False)  # auto-create Hive tables, default False
    self.client.makedirs(self.dir)
def __init__(self, config, expiration=30000):
    def get_token(username, password, expiration):
        ###
        # input_type: str, str, int
        # input: the username of PAI, the password of PAI and the expiration time of the token
        # output_type: str
        # output: token
        # Get the token from rest server API
        ###
        token_ready = False
        loop_count = 0
        while not token_ready:
            time.sleep(loop_count)
            loop_count += 1
            http_object = self.http.request(
                'POST',
                self.rest_server_url + 'token',
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded',
                },
                body='username='******'&password='******'&expiration=' + str(expiration))
            if http_object.status == 200:
                token_ready = True
                return json.loads(http_object.data.decode('utf-8'))['token']
            else:
                print(http_object.status, http_object.data)

    self.rest_server_url = config.rest_server_url  # rest server url
    self.http = urllib3.PoolManager()  # urllib3 http
    self.token = get_token(config.PAI_username, config.PAI_password,
                           expiration)  # rest server token
    self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
def reflect(self, reload_flag=False):
    if not self.ds_dict:
        si_app.logger.error('error: ds_dict must be provided.')
        return
    if reload_flag:
        si_app.delete_doc_from_index_by_datasource(ds_name=self.ds_dict['ds_name'])
    tclient = Client(self.ds_dict['ds_param']['hdfs_web_url'])  # self.ds_dict['ds_url']
    path_hdfs = self.ds_dict['ds_param']['root_path']  # self.ds_dict['table_group_name']
    if path_hdfs[-1] == '/':
        path_hdfs = path_hdfs[:-1]  # drop the trailing '/' before concatenating more path
    filelist = self.getHDFSFileInfo(tclient, path_hdfs)
    for fd in filelist:
        si_app.add_table_content_index(
            ds_name=self.ds_dict['ds_name'],
            table_id=fd['table_name'],
            table_info=json.dumps(fd),
            table_content_index=' '.join([fd[k] for k in fd.keys()]))
def generate_job_and_outputtable(schedules_list):
    job_list = []
    outtable_list = []
    client = Client("http://emr2-header-1.ipa.aidigger.com:50070", timeout=30)
    for work_id, schedule_id, cron_type in schedules_list:
        schedule_url = "https://pony.aidigger.com/api/v1/schedules/{}".format(schedule_id)
        job_infos = requests.get(schedule_url).json()["data"]
        time.sleep(1)
        owner = job_infos["owner"]
        print("schedule_url: " + schedule_url)
        for job_info in job_infos["execute_DAG"]:
            try:
                if job_info.get("job_info") \
                        and job_info["job_info"]["configs"].get("command", "") \
                        and job_info["job_info"]["configs"]["command"].startswith("data_pipeline"):
                    # or job_info["job_info"]["configs"]["command"].startswith("data_connector")
                    config = job_info["job_info"]["configs"]
                    if "args" in config.keys():
                        config['args']["isstreaming"] = str(
                            config['args']["isStreaming"]
                            if "isStreaming" in config['args'].keys()
                            else config['args']["isstreaming"])
                        config['args'].get("spark_conf", {}).get(
                            "dependency", {}).pop("data_pipeline", None)
                        config['args'].pop("KafkaCheckpoint", None)
                    job_list.append(
                        (work_id, config["job_id"], job_info["name"],
                         job_info["job_info"]["configs"]["command"],
                         "1G", "0.3", owner, cron_type))
                    for output in config["output"]:
                        outtable_list.append(deepcopy(output))
                        dayu_fullnames = output["dayu_fullname"].split(":")
                        if not dayu_fullnames:
                            raise Exception("error!!")
                        if dayu_fullnames[0].lower() == "hive":
                            dayu_fullnames[1] = "dayu_temp"
                            output["dayu_full_name"] = ":".join(dayu_fullnames) + "_k8spre"
                        elif dayu_fullnames[0].lower().startswith("oss"):
                            output["dayu_full_name"] = output["dayu_fullname"][:-1] + "_k8spre/"
                        elif dayu_fullnames[0].lower().startswith("kafka"):
                            output["dayu_full_name"] = output["dayu_fullname"] + "_k8spre"
                            output["dayu_full_name"] = output["dayu_full_name"].replace(".", "_")
                        else:
                            output["dayu_full_name"] = output["dayu_fullname"] + "_k8spre"
                        output.pop("dayu_id")
                    content = json.dumps(config).encode(encoding='utf-8')
                    client.write("/tmp/ting.wu/k8s_press/{}.json".format(config["job_id"]),
                                 overwrite=True, data=content)
                    print(" hdfs: /tmp/ting.wu/k8s_press/{}.json".format(config["job_id"]))
            except Exception as err:
                print(err)
    return job_list, outtable_list
def __init__(self, spark, config, generator):
    super(DataContext, self).__init__()
    self.spark = spark
    self.config = config
    self.generator = generator
    self.env_prefix = config.get("prefix", None)
    self.hdfs_client = Client(url=";".join(config["hdfs"]["name_node"]), proxy="joowing")
def from_settings(cls, settings):
    hdfs_master = settings['HDFS_MASTER']
    hdfs_address = settings['HDFS_ADDRESS']
    try:
        client = Client('http://' + str(hdfs_master) + ':' + str(hdfs_address))
    except Exception as e:
        print(e)
    return cls(client)
def connect_and_login(self, **kwargs):
    import requests

    host = None
    port = None
    user = None
    password = None
    root = None
    timeout = None
    proxy = None
    if 'host' in kwargs:
        host = kwargs['host']
    if 'port' in kwargs:
        port = kwargs['port']
    if 'kdc' in kwargs:
        kdc = kwargs['kdc']
    if 'user' in kwargs:
        user = kwargs['user']
    if 'password' in kwargs:
        password = kwargs['password']
    if 'root' in kwargs:
        root = kwargs['root']
    if 'proxy' in kwargs:
        proxy = kwargs['proxy']
    if 'timeout' in kwargs:
        timeout = kwargs['timeout']

    self.session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
    self.session.mount('http://', adapter)
    self.session.mount('https://', adapter)
    self.session.headers.update({'Connection': 'Keep-Alive'})
    self.connectionStatus = False
    try:
        timeout = int(timeout)
        url = "http://" + host + ":" + str(port)
        hdfsLogin = WebHDFS(url, kdc)
        cookieStr = hdfsLogin.authenticate(user, password)
        if cookieStr is not None:
            cookieList = cookieStr.split('=', 1)
            cookieDict = {cookieList[0]: cookieList[1]}
            requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)
        self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
    except HdfsError as hdfsError:
        raise HdfsLibraryError(str(hdfsError))
    except Exception as exception:
        raise HdfsLibraryError(str(exception))
    self.connectionStatus = True
    return self.client
def __init__(self, config: dict = None, file: str = 'openpai.json'):
    """config should contain
    - rest_server_socket
    - hdfs_web_socket
    - user
    - password
    """
    if config is None:
        with open(file) as fn:
            config = json.load(fn)
    for key in ['rest_server_socket', 'hdfs_web_socket', 'user', 'password']:
        assert key in config, '%s is not defined for OpenPAI' % (key)
    for key in ['rest_server_socket', 'hdfs_web_socket']:
        assert config[key].startswith('http://'), '%s should have http prefix' % (key)
    self.rest_server_socket = config['rest_server_socket']
    self.hdfs_client = Client(config['hdfs_web_socket'])
    self.config = config
class HdfsClient:
    """HDFS client built on top of the hdfs library."""

    def __init__(self, host, port=50070):
        self.url = "http://%s:%d" % (host, port)
        self.client = Client(url=self.url)

    def isExists(self, hdfs_path):
        try:
            status = self.client.acl_status(hdfs_path, strict=False)
            if status is not None:
                info = "file or directory %s exists." % hdfs_path
                return (0, info)
            else:
                info = "file or directory %s does not exist." % hdfs_path
                return (1, info)
        except Exception as e:
            info = "HDFS isExists:{}".format(str(e))
            return (2, info)
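A minimal usage sketch for the wrapper above; the namenode host and the HDFS path are illustrative assumptions, and the tuple layout follows the return values of isExists.

# Sketch only: host and path below are placeholders, not values from the original code.
hdfs = HdfsClient("namenode.example.com", port=50070)
code, info = hdfs.isExists("/tmp/demo.txt")  # 0: exists, 1: missing, 2: error while checking
print(code, info)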
def __init__(self, config, expiration=30000):
    def get_token(username, password, expiration):
        ###
        # input_type: str, str, int
        # input: the username of PAI, the password of PAI and the expiration time of the token
        # output_type: str
        # output: token
        # Get the token from rest server API
        ###
        rest_server_url_without_namespace = '/'.join(
            self.rest_server_url.split('/')[:-3]) + '/'
        token_ready = False
        loop_count = 0
        while not token_ready:
            time.sleep(loop_count)
            loop_count += 1
            http_object = self.http.request(
                'POST',
                rest_server_url_without_namespace + 'token',
                headers={
                    'Content-Type': 'application/json',
                },
                body=json.dumps({
                    'username': username,
                    'password': password,
                    'expiration': str(expiration)
                }))
            if http_object.status == 200:
                token_ready = True
                return json.loads(http_object.data.decode('utf-8'))['token']
            else:
                print(http_object.status, http_object.data)

    self.rest_server_url = config.rest_server_url  # rest server url
    self.http = urllib3.PoolManager()  # urllib3 http
    self.token = get_token(config.PAI_username, config.PAI_password,
                           expiration)  # rest server token
    self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
def get(self, request, *args, **kwargs):
    table_name = request.data.get('name')
    username = request.session['username']
    password = request.session['password']
    host = request.session['host']
    port = request.session['port']
    database_name = request.session['dbdatabase_name']
    obj = DataSource.objects
    conn = pymssql.connect(database=database_name, user=username,
                           password=password, host=host, port=port)
    client = Client(HDFS_HOST)
    cur = conn.cursor()
    for i in table_name:
        global rels
        cur.execute("select name from syscolumns where id = object_id('%s');" % (i))
        rels = []
        rel = []
        rows = cur.fetchall()
        for col in rows:
            for item in col:
                rel.append(item)
        rels.append(rel)
        # execute runs the query, like query functions in other languages
        cur.execute("SELECT * FROM %s" % (i))
        # fetchall stores the result set (a tuple of tuples) in rows1
        rows1 = cur.fetchall()
        # every element of the result set is one table record, represented as a tuple
        for row in rows1:
            rels.append(list(row))
        file_name = i + '.sql'
        format_name = uuid.uuid1()
        filepath = settings.MEDIA_ROOT + str(format_name)
        with open(filepath, 'wb+') as writer:
            for chunk in rels:
                writer.write(chunk)
        client.upload("/datahoop", filepath)
        obj.create(file_name=file_name, format_name=format_name, user_id=1)
        os.remove(filepath)
    client.close()
    cur.close()
    conn.close()
    return HttpResponse(json.dumps(rels), content_type='application/json')
def delete(self, request, *args, **kwargs):
    file_id = request.data.get('file_id')
    where = DataSource.objects.get(id=file_id).where
    if where == 'hdfs':
        file = DataSource.objects.get(id=file_id)
        hdfs_name = DataSource.objects.get(id=file_id).format_filename
        client = Client(HDFS_HOST)
        client.delete('/datahoop/' + hdfs_name, recursive=True)
        file.delete()
    else:
        client = pymongo.MongoClient(settings.MONGO_DB_HOST, settings.MONGO_DB_PORT)
        db = client.datahoop.data
        file_id = DataSource.objects.filter(id=id).first()
        obj_id = file_id.obj_id
        file_id.delete()
        db.remove({"_id": ObjectId(obj_id)})
        client.close()
    return HttpResponse(content_type='application/json')
def get(self, request, *args, **kwargs):
    # table_name = request.data.get('name')
    table_name = 'files_datasource'
    username = request.session['username']
    password = request.session['password']
    host = request.session['host']
    port = request.session['port']
    database_name = request.session['database_name']
    obj = DataSource.objects
    con = pymysql.connect(host, username, password, database_name)
    client = Client(HDFS_HOST)
    cur = con.cursor()
    # for i in table_name:
    sql = "select DISTINCT (COLUMN_NAME) from information_schema.COLUMNS where table_name = '%s'"
    cur.execute(sql % (table_name))
    rows = cur.fetchall()
    rels = []
    rel = []
    for i in rows:
        rel.append(i[0])
    rels.append(rel)
    # execute runs the query, like query functions in other languages
    cur.execute("SELECT * FROM %s" % (table_name))
    # fetchall stores the result set (a tuple of tuples) in rows
    rows = cur.fetchall()
    # every element of the result set is one table record, represented as a tuple
    for row in rows:
        rels.append(list(row))
    file_name = table_name + '.sql'
    format_name = uuid.uuid1()
    filepath = settings.MEDIA_ROOT + str(format_name)
    with open(filepath, 'wb+') as writer:
        for chunk in rels:
            writer.write(chunk)
    client.upload("/datahoop", filepath)
    obj.create(file_name=file_name, format_name=format_name, user_id=1)
    os.remove(filepath)
    client.close()
    con.close()
    cur.close()
    return HttpResponse(json.dumps(rels), content_type='application/json')
def get(self, request):
    # delete mydata
    file_id = request.GET.get('file_id')
    try:
        where = DataSource.objects.get(id=file_id).where
        print(DataSource.objects.get(id=file_id))
        print(where)
        format_filename = DataSource.objects.get(id=file_id).format_filename
        format_name_count = DataSource.objects.filter(format_filename=format_filename).count()
        if where == 'hdfs' and format_name_count == 1:
            file = DataSource.objects.get(id=file_id)
            hdfs_name = DataSource.objects.get(id=file_id).format_filename
            client = Client(HDFS_HOST)
            client.delete('/datahoop/' + hdfs_name, recursive=True)
            file.delete()
            item = Collect.objects.filter(file_id=file_id)
            if item:
                item.delete()
        elif where == 'hdfs' and format_name_count > 1:
            file = DataSource.objects.get(id=file_id)
            file.delete()
            item = Collect.objects.filter(file_id=file_id)
            if item:
                item.delete()
        else:
            client = pymongo.MongoClient(settings.MONGO_DB_URI)
            db = client.datahoop.data
            data_obj = DataSource.objects.filter(id=file_id).first()
            obj_id = data_obj.obj_id
            data_obj.delete()
            db.remove({"_id": ObjectId(obj_id)})
            client.close()
            item = Collect.objects.filter(file_id=file_id)
            if item:
                item.delete()
        return JsonResponse({'status': True})
    except:
        return JsonResponse({'status': False})
def build_connection(self):
    self.client = Client(self.hadoop_url)
from hdfs import Client

client = Client("http://master:9870")
# client.makedirs("/abc/xyz")
x = client.list("/")
y = client.list("/", status=True)
class HdfsPipeline(object):
    def __init__(self, **kwargs):
        self.table_cols_map = {}  # column order per table {table: (cols, col_default)}
        self.bizdate = bizdate  # business date: the date the spider was started
        self.buckets_map = {}  # buckets {table: items}
        self.bucketsize = kwargs.get('BUCKETSIZE')
        self.client = Client(kwargs.get('HDFS_URLS'))
        self.dir = kwargs.get('HDFS_FOLDER')  # folder path
        self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter, defaults to Hive's default delimiter
        self.encoding = kwargs.get('HDFS_ENCODING')  # file encoding, default 'utf-8'
        self.hive_host = kwargs.get('HIVE_HOST')
        self.hive_port = kwargs.get('HIVE_PORT')
        self.hive_dbname = kwargs.get('HIVE_DBNAME')  # database name
        self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE', False)  # auto-create Hive tables, default False
        self.client.makedirs(self.dir)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(**settings)

    def process_item(self, item, spider):
        """
        :param item:
        :param spider:
        :return: route each item into the bucket of its table
        """
        if item.tablename in self.buckets_map:
            self.buckets_map[item.tablename].append(item)
        else:
            cols, col_default = [], {}
            for field, value in item.fields.items():
                cols.append(field)
                col_default[field] = item.fields[field].get('default', '')
            cols.sort(key=lambda x: item.fields[x].get('idx', 1))
            self.table_cols_map.setdefault(
                item.tablename, (cols, col_default))  # table schema: column order and defaults
            self.buckets_map.setdefault(item.tablename, [item])
            if self.hive_auto_create:
                self.checktable(item.tablename, cols)  # create the Hive table
        self.buckets2db(bucketsize=self.bucketsize, spider_name=spider.name)  # flush the buckets that are full
        return item

    def close_spider(self, spider):
        """
        :param spider:
        :return: flush whatever is left in the buckets when the spider closes
        """
        self.buckets2db(bucketsize=1, spider_name=spider.name)

    def checktable(self, tbname, cols):
        """
        :return: create the Hive table
        """
        hive = CtrlHive(self.hive_host, self.hive_port, self.hive_dbname)
        cols = ['keyid'] + cols + ['bizdate', 'ctime', 'spider']
        create_sql = f"create table if not exists {tbname}({' string,'.join(cols)} string)"
        hive.execute(create_sql)
        logger.info(f"Table created <= table name: {tbname}")

    def buckets2db(self, bucketsize=100, spider_name=''):
        """
        :param bucketsize: bucket size
        :param spider_name: spider name
        :return: walk every bucket; write out and clear the ones that are full enough
        """
        for tablename, items in self.buckets_map.items():  # walk every bucket; flush and clear the full ones
            if len(items) >= bucketsize:
                new_items = []
                cols, col_default = self.table_cols_map.get(tablename)
                for item in items:
                    keyid = rowkey()
                    new_item = {'keyid': keyid}
                    for field in cols:
                        value = item.get(field, col_default.get(field))
                        new_item[field] = str(value).replace(self.delimiter, '').replace('\n', '')
                    new_item['bizdate'] = self.bizdate  # add the non-business bookkeeping columns
                    new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    new_item['spider'] = spider_name
                    value = self.delimiter.join(new_item.values())
                    new_items.append(value)

                # each table gets its own folder
                folder = f"{self.dir}/{tablename}"
                self.client.makedirs(folder)
                filename = f"{folder}/data.txt"
                info = self.client.status(filename, strict=False)
                if not info:
                    self.client.write(filename, data='', overwrite=True, encoding=self.encoding)

                try:
                    content = '\n'.join(new_items) + '\n'
                    self.client.write(filename, data=content, overwrite=False,
                                      append=True, encoding=self.encoding)
                    logger.info(f"Save succeeded <= file: {filename} records: {len(items)}")
                    items.clear()  # empty the bucket
                except Exception as e:
                    logger.error(f"Save failed <= file: {filename} reason: {e}")
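The pipeline above pulls all of its configuration from crawler.settings through kwargs.get. A hedged sketch of the corresponding Scrapy settings follows; the setting names match the keys the pipeline reads, while every concrete value (and the pipeline import path) is an illustrative assumption.

# Sketch of settings.py entries this pipeline expects; values are placeholders.
ITEM_PIPELINES = {'myproject.pipelines.HdfsPipeline': 300}  # assumed module path
BUCKETSIZE = 100                                   # flush a table bucket once it holds 100 items
HDFS_URLS = 'http://namenode.example.com:50070'    # WebHDFS endpoint
HDFS_FOLDER = '/crawl_data'                        # root folder, one subfolder per table
HDFS_DELIMITER = '\x01'                            # Hive's default field delimiter (Ctrl-A)
HDFS_ENCODING = 'utf-8'
HIVE_HOST = 'hive.example.com'
HIVE_PORT = 10000
HIVE_DBNAME = 'crawl'
HIVE_AUTO_CREATE = True                            # let the pipeline issue CREATE TABLE IF NOT EXISTS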
# imports
from hdfs import Client

client = Client("http://master:9870")
# client.makedirs("/abc/xyz")
x = client.list("/")
y = client.list("/", status=True)
print(y[1][0])
print(y[1][1]["accessTime"])
client.upload("/abc", "HDFSDao.py")
client.download("/abc/HDFSDao.py", "d:/ttt.py")
print("end___")
def __init__(self, host, port=50070):
    self.url = "http://%s:%d" % (host, port)
    self.client = Client(url=self.url)
class RF_HDFS(object):
    def __init__(self):
        self.client = None
        self.directory = None

    def connect_and_login(self, **kwargs):
        import requests

        host = None
        port = None
        user = None
        password = None
        root = None
        timeout = None
        proxy = None
        if 'host' in kwargs:
            host = kwargs['host']
        if 'port' in kwargs:
            port = kwargs['port']
        if 'kdc' in kwargs:
            kdc = kwargs['kdc']
        if 'user' in kwargs:
            user = kwargs['user']
        if 'password' in kwargs:
            password = kwargs['password']
        if 'root' in kwargs:
            root = kwargs['root']
        if 'proxy' in kwargs:
            proxy = kwargs['proxy']
        if 'timeout' in kwargs:
            timeout = kwargs['timeout']

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection': 'Keep-Alive'})
        self.connectionStatus = False
        try:
            timeout = int(timeout)
            url = "http://" + host + ":" + str(port)
            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr is not None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)
            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        return self.connectionStatus

    def list_dir(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=True)
            else:
                output = self.client.list(self.client.root, status=True)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def list_names(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=False)
            else:
                output = self.client.list(self.client.root, status=False)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        output = None
        try:
            output = self.client.upload(remote_path, local_path, overwrite, permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace after
            # the error message, so split on '\n' and only return the first line.
            error = str(hdfsError).splitlines()[0]
            raise HdfsLibraryError(error)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def download(self, remote_path, local_path, overwrite=False):
        output = None
        try:
            output = self.client.download(remote_path, local_path, overwrite)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def mkdir(self, directory, permission):
        try:
            # no return value
            self.client.makedirs(directory, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rmdir(self, directory):
        try:
            # no return value
            if self.client.delete(directory, recursive=True) == False:
                raise HdfsLibraryError("Directory does not exist: %r", directory)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rename(self, src_file, dst_file):
        try:
            # no return value
            self.client.rename(src_file, dst_file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def delete(self, file):
        try:
            # no return value
            if self.client.delete(file) == False:
                raise HdfsLibraryError("File does not exist: %r", file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_time(self, file, mod_time):
        try:
            # no return value
            self.client.set_times(file, -1, mod_time)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_owner(self, file, owner, group):
        try:
            # no return value
            self.client.set_owner(file, owner=owner, group=group)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_permission(self, file, permission):
        try:
            # no return value
            self.client.set_permission(file, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_acl(self, file, aclspec):
        try:
            # no return value
            self.client.set_acl(file, aclspec=aclspec)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def status(self, path):
        output = ''
        try:
            output = self.client.status(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def checksum(self, path):
        output = ''
        try:
            output = self.client.checksum(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def close(self):
        self.session.close()
def __init__(self):
    self.client = Client("http://fantome:50070")
import pandas as pd
import os
from hdfs import Client

# Current approach for reading a file from HDFS:
# 1. read the file from HDFS as a binary stream
# 2. save the binary content locally as a .csv file
# 3. read the .csv file with pandas

HDFSHOST = "http://172.16.18.112:50070"
train_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19761"
test_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19762"
train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path

client = Client(HDFSHOST)
with client.read(train_FILENAME) as tr_s:
    tr_content = tr_s.read()
    tr_s = str(tr_content, 'utf-8')

# make sure the file is fully written
tr_file = open("trainData.csv", "w")
tr_file.flush()
os.fsync(tr_file)
tr_file.write(tr_s)
tr_file.close()

# read the file
df_train = pd.read_csv("trainData.csv", header=0)
print(df_train)

with client.read(test_FILENAME) as te_fs:
    te_content = te_fs.read()
    te_s = str(te_content, 'utf-8')
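The three-step flow in the comments above (stream, save locally, read with pandas) can likely be shortened: client.read() yields a file-like reader that pandas can consume directly. A minimal sketch, reusing HDFSHOST and train_FILENAME from the snippet above:

# Sketch: read the HDFS CSV straight into pandas, skipping the local temp file.
import pandas as pd
from hdfs import Client

client = Client(HDFSHOST)
with client.read(train_FILENAME, encoding='utf-8') as reader:
    df_train = pd.read_csv(reader, header=0)
print(df_train.head())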
        print(rowkey)
        mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr),
                     Mutation(column=self.columnFamily + ":visitTime", value=visitTime),
                     Mutation(column=self.columnFamily + ":user_id", value=user_id),
                     Mutation(column=self.columnFamily + ":link", value=link)]
        # submit multiple rows in one batch
        mutations_batch.append(BatchMutation(row=rowkey, mutations=mutations))
        if len(mutations_batch) % batch_size == 0:
            self.client.mutateRows(self.tablename, mutations_batch)
            mutations_batch = []


if __name__ == "__main__":
    # set up the HBase connection
    hbasewriteer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbasewriteer.createTable()
    # connect to HDFS
    client = Client(HDFSNN)
    # list the log files
    logFiles = client.list(LOGPATH)
    # read each file
    for logfile in logFiles:
        with client.read(os.path.join(LOGPATH, logfile)) as deal_file_handle:
            hbasewriteer.importData(deal_file_handle)
class ChatBotModel(object):
    def __init__(self, hadoop_url, hdfs_index_file, local_index_file, corpus_dir,
                 unk_answer='', max_answer_len=1024):
        self.hadoop_url = hadoop_url
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir
        self.max_answer_len = max_answer_len
        self.unk_answer = unk_answer
        self.client = None
        self.inverted_index = {}

    def build_connection(self):
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        with open(self.local_index_file, 'r', encoding='utf-8') as f:
            for line in f:
                word, *querys = line.strip().split('\t')
                for query in querys:
                    file_name, query_id, score = query.split(':')
                    if word in self.inverted_index:
                        self.inverted_index[word].append(
                            [file_name, int(query_id), float(score)])
                    else:
                        self.inverted_index[word] = []
                        self.inverted_index[word].append(
                            [file_name, int(query_id), float(score)])

    def prepare(self):
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        file_path = os.path.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path, offset=query_id,
                              length=self.max_answer_len, encoding='utf-8') as f:
            answer = f.read().strip().split('\n')[0]
            return answer

    def predict_answer(self, query):
        words = jieba.lcut_for_search(query)
        querys = {}
        for word in words:
            if word not in self.inverted_index:
                continue
            for file_name, query_id, score in self.inverted_index[word]:
                query = (file_name, query_id)
                if query in querys:
                    querys[query] += score
                else:
                    querys[query] = score
        if len(querys) == 0:
            return self.unk_answer
        best_query = max(querys.items(), key=lambda x: x[1])
        (best_file_name, best_query_id), best_score = best_query
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            best_answer = self.unk_answer
        return best_answer
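A short usage sketch for the class above: prepare() connects to HDFS, downloads the inverted index, and loads it into memory before predict_answer is called. Every URL, path, and string below is an illustrative assumption.

# Sketch only: endpoint, paths, and query text are placeholders.
bot = ChatBotModel(
    hadoop_url='http://namenode.example.com:50070',   # assumed WebHDFS endpoint
    hdfs_index_file='/chatbot/inverted_index.txt',    # assumed index location on HDFS
    local_index_file='inverted_index.txt',
    corpus_dir='/chatbot/corpus',
    unk_answer='Sorry, I do not know.')
bot.prepare()                          # connect, download the index, load it into memory
print(bot.predict_answer('你好'))       # return the best-scoring answer from the corpus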
def train(train_path, test_path, output_path, target, train_split_ratio=0.33,
          penalty='l2', dual=False, tol=1e-4, C=1.0, random_state=None,
          multi_class='ovr'):
    # record the start time
    time.localtime()
    time_trains_start = time.strftime('%Y{y}%m{m}%d{d} %H{h}%M{f}%S{s}'.format(
        y='/', m='/', d='', h=':', f=':', s=''))
    start_time = time.time()

    # input file paths
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
    test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
    client = Client(HDFS_HOSTS1)

    # read the training data
    with client.read(train_FILENAME) as tr_s:
        tr_content = tr_s.read()
        tr_s = str(tr_content, 'utf-8')
    # make sure the file is fully written
    tr_file = open("trainData.csv", "w")
    tr_file.flush()
    os.fsync(tr_file)
    tr_file.write(tr_s)
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
        te_s = str(te_content, 'utf-8')
    # make sure the file is fully written
    te_file = open("testData.csv", "w")
    te_file.flush()
    os.fsync(te_file)
    te_file.write(te_s)
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    test_data_num = df_train.shape[0]
    train_data_num = df_train.shape[0]

    # scale the prediction set
    df_test = min_max_scaler.fit_transform(df_test)
    df_test = np.array(df_test)

    # data preparation and cleaning
    cols = [tmp_i for tmp_i in df_train.columns if tmp_i not in [target]]
    X = df_train[cols]
    X = np.array(X)
    X = min_max_scaler.fit_transform(X)
    Y = df_train[target]
    Y = np.array(Y)

    # split the training set
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=train_split_ratio)

    # train a logistic regression model with scikit-learn
    clf = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                             random_state=random_state, multi_class=multi_class,
                             solver='liblinear')
    clf.fit(X_train, Y_train)

    # accuracy
    train_acc = clf.score(X_test, Y_test)
    print('score Scikit learn: ', train_acc)
    # precision
    train_precision_score = precision_score(Y_test, clf.predict(X_test))
    # recall
    train_recall_score = recall_score(Y_test, clf.predict(X_test))
    # F1 score
    train_f1_score = f1_score(Y_test, clf.predict(X_test))
    # ROC AUC
    train_roc_auc_score1 = roc_auc_score(Y_test, clf.predict(X_test))

    # predict on the prediction set with the scikit-learn LR model
    result = clf.predict(df_test)
    # print(result)

    # record the end time and compute the total training time
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)

    # ++++++++++++++++++++++++++++++++++++++++ save the training results +++++++++++++++++++++++++++++++++++++++ #
    # save the model summary report
    # abstract_path = HDFS_HOSTS1 + output_path + '/abstract/data/'
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = ['FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
                  'time_trains_all', 'test_data_num', 'train_data_num']
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % LogisticRegression
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # if len(client.list(abstract_path)):
    #     client.delete(abstract_path + 'abstract.csv')
    #     client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # else:
    #     client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    # save the model version information csv
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = ['accuracy', 'time_trains_start', 'time_trains_all',
                  'test_data_num', 'train_data_num']
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    # save the training evaluation metrics report
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = ['accuracy', 'train_precision_score', 'train_recall_score',
                  'train_f1_score', 'train_roc_auc_score1']
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # save the prediction results for the test set
    file_csv_path = output_path + '/result/data/'
    # the dict keys become the csv column names
    dataframe = pd.DataFrame({target: result})
    # index=False omits the row index when writing the csv
    dataframe.to_csv("result.csv", index=False, sep=',')
    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
def conn(self):
    client = Client('http://192.168.0.107:11070')
    return client