Example #1
def download(hdfs_path, local_path):
    # initialize the HDFS connection
    client = Client('http://fisher.lazybone.xyz:9001', root='/')
    if os.path.exists(local_path):
        print('file already exists locally')
        return
    client.download(hdfs_path=hdfs_path, local_path=local_path)
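
A related sketch (not part of the original example): before pulling a file down, you can also verify that the remote path exists, using the same status(..., strict=False) call that appears in Example #24 further below. The namenode URL here is a placeholder.

from hdfs import Client
import os

def download_if_remote_exists(hdfs_path, local_path):
    # placeholder WebHDFS endpoint; substitute your own namenode URL
    client = Client('http://namenode:50070', root='/')
    # with strict=False, status() returns None instead of raising when the path is missing
    if client.status(hdfs_path, strict=False) is None:
        print('remote file does not exist')
        return
    if os.path.exists(local_path):
        print('file already exists locally')
        return
    client.download(hdfs_path=hdfs_path, local_path=local_path)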
Example #2
def get_data():
    client = Client("http://t3.dev:50070", "hadoop")
    # client = InsecureClient(url="http://t3.dev:50070", user="******", root="/")
    print(client.list("/huiqu/common/area.txt"))
    with client.read("/huiqu/common/area.txt/part-00000") as read:
        # print(read.read().decode('utf8'))
        return {"data": read.read().decode('utf8')}
Example #3
def save_to_storage(filename, sha1):
    if args.no_save:
        return False
    from hdfs import Client

    client = Client('http://{}:50070'.format(config.get("hdfs", "host")), root=config.get("reportparser", "storage"))
    uploaded = client.upload(sha1, filename, overwrite=True)
    return uploaded
Example #4
def main():
    client = Client("http://127.0.0.1:50070",
                    root="/",
                    timeout=100,
                    session=False)
    #client.makedirs("/news")
    client.upload("/input", "x.html")
    print(client.list("/"))
Example #5
 def get(self, request):
     client = Client(HDFS_HOST)
     hdfs = request.GET.get('hdfs')
     file_name = DataSource.objects.get(format_filename=hdfs).file_name
     client.download('/datahoop/' + hdfs, settings.MEDIA_ROOT + 'hdfs_download')
     path = os.path.join(settings.MEDIA_ROOT, 'hdfs_download')
     file = open(os.path.join(path, hdfs), 'rb')
     # stream the downloaded file back to the browser as a CSV attachment
     response = FileResponse(file, content_type='application/vnd.ms-csv')
     response['Content-Disposition'] = 'attachment; filename={}.csv'.format(file_name.split('.')[0])
     return response
Example #6
class BiliaprioriPipeline(object):
    def __init__(self):
        self.client = Client("http://fantome:50070")

    def process_item(self, item, spider):
        print("Get Tag", "  tag:", item['tagInfo'])

        # keep appending to the same file
        self.client.write('/bili_3-7day/tagInfo.txt',
                          item['tagInfo'] + "\n",
                          overwrite=False,
                          append=True,
                          encoding="utf-8")
Example #7
    def upload(name, file_path, config):
        env_prefix = config.get("prefix", None)
        hdfs_client = Client(url=config["hdfs"]["name_node"])
        hdfs_hosts = []
        hdfs_http_host = config["hdfs"]["name_node"]
        hdfs_hosts.append(hdfs_http_host.replace("http://", ""))
        hdfs_data_service_root = "/data_service"
        if env_prefix is not None:
            hdfs_data_service_root = "/{0}_data_service".format(env_prefix)

        hdfs_client.makedirs(hdfs_data_service_root)
        timestamp = int(round(time.time() * 1000))
        target_file_name = "{2}/{0}/{1}/{0}_{1}.py".format(
            name, str(timestamp), hdfs_data_service_root)
        hdfs_client.makedirs("{2}/{0}/{1}".format(name, str(timestamp),
                                                  hdfs_data_service_root))
        print("hdfs file name: {0}".format(target_file_name))
        hdfs_client.upload(target_file_name, file_path)
        zip_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            "joowing.zip")
        target_zp_file_name = "{2}/{0}/{1}/joowing.zip".format(
            name, str(timestamp), hdfs_data_service_root)
        # hdfs_client.upload(target_zp_file_name, zip_path)
        # return target_file_name, target_zp_file_name
        return target_file_name
Example #8
 def __init__(self, **kwargs):
     self.table_cols_map = {}  # column order per table {table: (cols, col_default)}
     self.bizdate = bizdate  # business date = the date the spider was started
     self.buckets_map = {}  # buckets {table: items}
     self.bucketsize = kwargs.get('BUCKETSIZE')
     self.client = Client(kwargs.get('HDFS_URLS'))
     self.dir = kwargs.get('HDFS_FOLDER')  # folder path
     self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter, defaults to Hive's default delimiter
     self.encoding = kwargs.get('HDFS_ENCODING')  # file encoding, defaults to 'utf-8'
     self.hive_host = kwargs.get('HIVE_HOST')
     self.hive_port = kwargs.get('HIVE_PORT')
     self.hive_dbname = kwargs.get('HIVE_DBNAME')  # database name
     self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE',
                                        False)  # whether to auto-create Hive tables, defaults to False
     self.client.makedirs(self.dir)
Example #9
    def __init__(self, config, expiration=30000):
        def get_token(username, password, expiration):
            ###
            # input_type: str, str, int
            # input: the username of PAI, the password of PAI and the expiration time of the token
            # output_type: str
            # output: token
            # Get the token from rest server API
            ###
            token_ready = False
            loop_count = 0
            while not token_ready:
                time.sleep(loop_count)
                loop_count += 1
                http_object = self.http.request(
                    'POST',
                    self.rest_server_url + 'token',
                    headers={
                        'Content-Type': 'application/x-www-form-urlencoded',
                    },
                    body='username=' + username + '&password=' + password +
                         '&expiration=' + str(expiration))
                if http_object.status == 200:
                    token_ready = True
                    return json.loads(
                        http_object.data.decode('utf-8'))['token']
                else:
                    print(http_object.status, http_object.data)

        self.rest_server_url = config.rest_server_url  # rest server url
        self.http = urllib3.PoolManager()  # urllib3 http
        self.token = get_token(config.PAI_username, config.PAI_password,
                               expiration)  # rest Server token
        self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
Example #10
    def reflect(self, reload_flag = False):
        if not self.ds_dict:
            si_app.logger.error('error: ds_dict must be provided.')
            return

        if reload_flag:
            si_app.delete_doc_from_index_by_datasource(ds_name=self.ds_dict['ds_name'])


        tclient = Client( self.ds_dict['ds_param']['hdfs_web_url'])     # self.ds_dict['ds_url'])
        path_hdfs = self.ds_dict['ds_param']['root_path']      # self.ds_dict['table_group_name']

        if path_hdfs[-1] == '/':
            # remove the trailing '/' before concatenating further path components
            path_hdfs = path_hdfs[:-1]

        filelist = self.getHDFSFileInfo(tclient, path_hdfs)


        for fd in filelist:
            si_app.add_table_content_index(ds_name = self.ds_dict['ds_name'],
                                           table_id=fd['table_name'],
                                           table_info=(json.dumps(fd) ),
                                           table_content_index = ' '.join([fd[k] for k in fd.keys() ])
                                           )
Example #11
def generate_job_and_outputtable(schedules_list):
    job_list = []
    outtable_list = []
    client = Client("http://emr2-header-1.ipa.aidigger.com:50070", timeout=30)
    for work_id, schedule_id, cron_type in schedules_list:
        schedule_url = "https://pony.aidigger.com/api/v1/schedules/{}".format(
            schedule_id)
        job_infos = requests.get(schedule_url).json()["data"]
        time.sleep(1)
        owner = job_infos["owner"]
        print("schedule_url: " + schedule_url)
        for job_info in job_infos["execute_DAG"]:
            try:
                if job_info.get("job_info") \
                    and job_info["job_info"]["configs"].get("command","") \
                    and job_info["job_info"]["configs"]["command"].startswith("data_pipeline") : #or job_info["job_info"]["configs"]["command"].startswith("data_connector")
                    config = job_info["job_info"]["configs"]
                    if "args" in config.keys():
                        config['args']["isstreaming"] = str(
                            config['args']["isStreaming"] if "isStreaming" in
                            config['args'].keys(
                            ) else config['args']["isstreaming"])
                        config['args'].get("spark_conf", {}).get(
                            "dependency", {}).pop("data_pipeline", None)
                        config['args'].pop("KafkaCheckpoint", None)
                    job_list.append(
                        (work_id, config["job_id"], job_info["name"],
                         job_info["job_info"]["configs"]["command"], "1G",
                         "0.3", owner, cron_type))
                    for output in config["output"]:
                        outtable_list.append(deepcopy(output))
                        dayu_fullnames = output["dayu_fullname"].split(":")
                        if not dayu_fullnames:
                            raise Exception("error!!")
                        if dayu_fullnames[0].lower() == "hive":
                            dayu_fullnames[1] = "dayu_temp"
                            output["dayu_full_name"] = ":".join(
                                dayu_fullnames) + "_k8spre"
                        elif dayu_fullnames[0].lower().startswith("oss"):
                            output["dayu_full_name"] = output[
                                "dayu_fullname"][:-1] + "_k8spre/"
                        elif dayu_fullnames[0].lower().startswith("kafka"):
                            output["dayu_full_name"] = output[
                                "dayu_fullname"] + "_k8spre"
                            output["dayu_full_name"] = output[
                                "dayu_full_name"].replace(".", "_")
                        else:
                            output["dayu_full_name"] = output[
                                "dayu_fullname"] + "_k8spre"
                        output.pop("dayu_id")
                    content = json.dumps(config).encode(encoding='utf-8')
                    client.write("/tmp/ting.wu/k8s_press/{}.json".format(
                        config["job_id"]),
                                 overwrite=True,
                                 data=content)
                    print("  hdfs: /tmp/ting.wu/k8s_press/{}.json".format(
                        config["job_id"]))
            except Exception as err:
                print(err)
    return job_list, outtable_list
Example #12
 def __init__(self, spark, config, generator):
     super(DataContext, self).__init__()
     self.spark = spark
     self.config = config
     self.generator = generator
     self.env_prefix = config.get("prefix", None)
     self.hdfs_client = Client(url=";".join(config["hdfs"]["name_node"]),
                               proxy="joowing")
Example #13
    def from_settings(cls, settings):
        hdfs_master = settings['HDFS_MASTER']
        hdfs_address = settings['HDFS_ADDRESS']
        try:
            client = Client('http://' + str(hdfs_master) + ':' +
                            str(hdfs_address))
        except Exception as e:
            print(e)
            raise

        return cls(client)
Example #14
    def connect_and_login(self, **kwargs):
        import requests

        host = None
        port = None
        user = None
        password = None
        root = None
        timeout = None
        proxy = None

        if 'host' in kwargs:
            host = kwargs['host']
        if 'port' in kwargs:
            port = kwargs['port']
        if 'kdc' in kwargs:
            kdc = kwargs['kdc']
        if 'user' in kwargs:
            user = kwargs['user']
        if 'password' in kwargs:
            password = kwargs['password']
        if 'root' in kwargs:
            root = kwargs['root']
        if 'proxy' in kwargs:
            proxy = kwargs['proxy']
        if 'timeout' in kwargs:
            timeout = kwargs['timeout']

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://',  adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection':'Keep-Alive'})

        self.connectionStatus = False
        try:
            timeout = int(timeout)
            url = "http://" + host + ":" + str(port)

            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr != None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)

            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

        self.connectionStatus = True
        return self.client
Example #15
    def __init__(self, config: dict = None, file: str = 'openpai.json'):
        """config should contain
            - rest_server_socket
            - hdfs_web_socket
            - user
            - password
        """
        if config is None:
            with open(file) as fn:
                config = json.load(fn)
        for key in [
                'rest_server_socket', 'hdfs_web_socket', 'user', 'password'
        ]:
            assert key in config, '%s is not defined for OpenPAI' % (key)
        for key in ['rest_server_socket', 'hdfs_web_socket']:
            assert config[key].startswith(
                'http://'), '%s should have http prefix' % (key)

        self.rest_server_socket = config['rest_server_socket']
        self.hdfs_client = Client(config['hdfs_web_socket'])
        self.config = config
Example #16
class HdfsClient:
    """基于hdfs library实现的hdfs客户端
	"""
    def __init__(self, host, port=50070):
        self.url = "http://%s:%d" % (host, port)
        self.client = Client(url=self.url)

    def isExists(self, hdfs_path):
        try:
            status = self.client.acl_status(hdfs_path, strict=False)
            if status != None:
                info = "file or directory %s is existed." % hdfs_path
                return (0, info)
            else:
                info = "file or directory %s not existed." % hdfs_path
                return (1, info)
        except Exception as e:
            info = "HDFS isExists:{}".format(str(e))
            return (2, info)
Example #17
    def __init__(self, config, expiration=30000):
        def get_token(username, password, expiration):
            ###
            # input_type: str, str, int
            # input: the username of PAI, the password of PAI and the expiration time of the token
            # output_type: str
            # output: token
            # Get the token from rest server API
            ###
            rest_server_url_without_namespace = '/'.join(
                self.rest_server_url.split('/')[:-3]) + '/'
            token_ready = False
            loop_count = 0
            while not token_ready:
                time.sleep(loop_count)
                loop_count += 1
                http_object = self.http.request(
                    'POST',
                    rest_server_url_without_namespace + 'token',
                    headers={
                        'Content-Type': 'application/json',
                    },
                    body=json.dumps({
                        'username': username,
                        'password': password,
                        'expiration': str(expiration)
                    }))
                if http_object.status == 200:
                    token_ready = True
                    return json.loads(
                        http_object.data.decode('utf-8'))['token']
                else:
                    print(http_object.status, http_object.data)

        self.rest_server_url = config.rest_server_url  # rest server url
        self.http = urllib3.PoolManager()  # urllib3 http
        self.token = get_token(config.PAI_username, config.PAI_password,
                               expiration)  # rest Server token
        self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
Example #18
    def get(self, request, *args, **kwargs):
        table_name = request.data.get('name')
        username = request.session['username']
        password = request.session['password']
        host = request.session['host']
        port = request.session['port']
        database_name = request.session['dbdatabase_name']
        obj = DataSource.objects
        conn = pymssql.connect(database=database_name, user=username, password=password, host=host, port=port)
        client = Client(HDFS_HOST)
        cur = conn.cursor()
        for i in table_name:
            global rels
            cur.execute("select name from syscolumns where id = object_id('%s');" % (i))
            rels = []
            rel = []
            rows = cur.fetchall()
            for row in rows:
                for item in row:
                    rel.append(item)
            rels.append(rel)
            # execute() runs the query, much like a query() call in other languages
            cur.execute("SELECT * FROM  %s" % (i))
            # fetchall() collects the result set (a tuple of tuples) into rows1
            rows1 = cur.fetchall()
            # each element of the result set is one table row, represented as a tuple
            for row in rows1:
                rels.append(list(row))
            file_name = i + '.sql'
            format_name = uuid.uuid1()
            filepath = settings.MEDIA_ROOT + str(format_name)
            with open(filepath, 'wb+') as writer:
                for chunk in rels:
                    # serialize each row as a line of text; raw lists cannot be written in binary mode
                    writer.write((str(chunk) + '\n').encode('utf-8'))

            client.upload("/datahoop", filepath)
            obj.create(file_name=file_name, format_name=format_name, user_id=1)
            os.remove(filepath)
        client.close()
        cur.close()
        conn.close()

        return HttpResponse(json.dumps(rels), content_type='application/json')
Example #19
    def delete(self, request, *args, **kwargs):

        file_id = request.data.get('file_id')
        where = DataSource.objects.get(id=file_id).where
        if where == 'hdfs':
            file = DataSource.objects.get(id=file_id)
            hdfs_name = DataSource.objects.get(id=file_id).format_filename
            client = Client(HDFS_HOST)
            client.delete('/datahoop/' + hdfs_name, recursive=True)
            file.delete()
        else:
            client = pymongo.MongoClient(settings.MONGO_DB_HOST, settings.MONGO_DB_PORT)
            db = client.datahoop.data
            file_id = DataSource.objects.filter(id=file_id).first()
            obj_id = file_id.obj_id
            file_id.delete()
            db.remove({"_id": ObjectId(obj_id)})
            client.close()
        return HttpResponse(content_type='application/json')
Example #20
    def get(self, request, *args, **kwargs):
        # table_name = request.data.get('name')
        table_name = 'files_datasource'
        username = request.session['username']
        password = request.session['password']
        host = request.session['host']
        port = request.session['port']
        database_name = request.session['database_name']
        obj = DataSource.objects
        con = pymysql.connect(host, username, password, database_name)
        client = Client(HDFS_HOST)
        cur = con.cursor()
        # for i in table_name:
        sql = "select DISTINCT (COLUMN_NAME) from information_schema.COLUMNS where table_name = '%s'"
        cur.execute(sql % (table_name))
        rows = cur.fetchall()
        rels = []
        rel = []
        for i in rows:
            rel.append(i[0])
        rels.append(rel)
        # execute() runs the query, much like a query() call in other languages
        cur.execute("SELECT * FROM  %s" % (table_name))
        # fetchall() collects the result set (a tuple of tuples) into rows
        rows = cur.fetchall()
        # each element of the result set is one table row, represented as a tuple
        for row in rows:
            rels.append(list(row))
        file_name = table_name + '.sql'
        format_name = uuid.uuid1()
        filepath = settings.MEDIA_ROOT + str(format_name)
        with open(filepath, 'wb+') as writer:
            for chunk in rels:
                # serialize each row as a line of text; raw lists cannot be written in binary mode
                writer.write((str(chunk) + '\n').encode('utf-8'))

        client.upload("/datahoop", filepath)
        obj.create(file_name=file_name, format_name=format_name, user_id=1)
        os.remove(filepath)
        client.close()
        con.close()
        cur.close()

        return HttpResponse(json.dumps(rels), content_type='application/json')
Example #21
 def get(self, request):  # delete mydata
     file_id = request.GET.get('file_id')
     try:
         where = DataSource.objects.get(id=file_id).where
         print(DataSource.objects.get(id=file_id))
         print(where)
         format_filename = DataSource.objects.get(
             id=file_id).format_filename
         format_name_count = DataSource.objects.filter(
             format_filename=format_filename).count()
         if where == 'hdfs' and format_name_count == 1:
             file = DataSource.objects.get(id=file_id)
             hdfs_name = DataSource.objects.get(id=file_id).format_filename
             client = Client(HDFS_HOST)
             client.delete('/datahoop/' + hdfs_name, recursive=True)
             file.delete()
             item = Collect.objects.filter(file_id=file_id)
             if item:
                 item.delete()
         elif where == 'hdfs' and format_name_count > 1:
             file = DataSource.objects.get(id=file_id)
             file.delete()
             item = Collect.objects.filter(file_id=file_id)
             if item:
                 item.delete()
         else:
             client = pymongo.MongoClient(settings.MONGO_DB_URI)
             db = client.datahoop.data
             data_obj = DataSource.objects.filter(id=file_id).first()
             obj_id = data_obj.obj_id
             data_obj.delete()
             db.remove({"_id": ObjectId(obj_id)})
             client.close()
             item = Collect.objects.filter(file_id=file_id)
             if item:
                 item.delete()
         return JsonResponse({'status': True})
     except:
         return JsonResponse({'status': False})
Example #22
 def build_connection(self):
     self.client = Client(self.hadoop_url)
Example #23
from hdfs import Client

client = Client("http://master:9870")
#client.makedirs("/abc/xyz")
x = client.list("/")
y = client.list("/", status=True)
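
With status=True the listing returns (name, status) pairs instead of bare names, where status is the WebHDFS FileStatus dictionary (fields such as type, length, modificationTime and the accessTime used in Example #25 below). A small sketch, reusing the same URL as above:

from hdfs import Client

client = Client("http://master:9870")
for name, status in client.list("/", status=True):
    # print each entry's name, its type (FILE or DIRECTORY) and its size in bytes
    print(name, status["type"], status["length"])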
Example #24
class HdfsPipeline(object):
    def __init__(self, **kwargs):
        self.table_cols_map = {}  # column order per table {table: (cols, col_default)}
        self.bizdate = bizdate  # business date = the date the spider was started
        self.buckets_map = {}  # buckets {table: items}
        self.bucketsize = kwargs.get('BUCKETSIZE')
        self.client = Client(kwargs.get('HDFS_URLS'))
        self.dir = kwargs.get('HDFS_FOLDER')  # folder path
        self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter, defaults to Hive's default delimiter
        self.encoding = kwargs.get('HDFS_ENCODING')  # file encoding, defaults to 'utf-8'
        self.hive_host = kwargs.get('HIVE_HOST')
        self.hive_port = kwargs.get('HIVE_PORT')
        self.hive_dbname = kwargs.get('HIVE_DBNAME')  # database name
        self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE',
                                           False)  # whether to auto-create Hive tables, defaults to False
        self.client.makedirs(self.dir)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(**settings)

    def process_item(self, item, spider):
        """
        :param item:
        :param spider:
        :return: route each item into its per-table bucket and flush full buckets
        """
        if item.tablename in self.buckets_map:
            self.buckets_map[item.tablename].append(item)
        else:
            cols, col_default = [], {}
            for field, value in item.fields.items():
                cols.append(field)
                col_default[field] = item.fields[field].get('default', '')
            cols.sort(key=lambda x: item.fields[x].get('idx', 1))
            self.table_cols_map.setdefault(
                item.tablename, (cols, col_default))  # record table schema: column order and defaults
            self.buckets_map.setdefault(item.tablename, [item])
            if self.hive_auto_create:
                self.checktable(item.tablename, cols)  # create the Hive table
        self.buckets2db(bucketsize=self.bucketsize,
                        spider_name=spider.name)  # flush buckets that have reached the threshold
        return item

    def close_spider(self, spider):
        """
        :param spider:
        :return: flush whatever is left in the buckets when the spider closes
        """
        self.buckets2db(bucketsize=1, spider_name=spider.name)

    def checktable(self, tbname, cols):
        """
        :return: create the Hive table if it does not exist
        """
        hive = CtrlHive(self.hive_host, self.hive_port, self.hive_dbname)
        cols = ['keyid'] + cols + ['bizdate', 'ctime', 'spider']
        create_sql = f"create table if not exists {tbname}({' string,'.join(cols)} string)"
        hive.execute(create_sql)
        logger.info(f"table created <= table name: {tbname}")

    def buckets2db(self, bucketsize=100, spider_name=''):
        """
        :param bucketsize: bucket size threshold
        :param spider_name: spider name
        :return: walk every bucket; flush and clear any bucket that meets the threshold
        """
        # walk every bucket and flush the ones that have reached the threshold
        for tablename, items in self.buckets_map.items():
            if len(items) >= bucketsize:
                new_items = []
                cols, col_default = self.table_cols_map.get(tablename)
                for item in items:
                    keyid = rowkey()
                    new_item = {'keyid': keyid}
                    for field in cols:
                        value = item.get(field, col_default.get(field))
                        new_item[field] = str(value).replace(
                            self.delimiter, '').replace('\n', '')
                    new_item['bizdate'] = self.bizdate  # add non-business (audit) columns
                    new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                      time.localtime())
                    new_item['spider'] = spider_name
                    value = self.delimiter.join(new_item.values())
                    new_items.append(value)

                # each table gets its own folder
                folder = f"{self.dir}/{tablename}"
                self.client.makedirs(folder)

                filename = f"{folder}/data.txt"
                info = self.client.status(filename, strict=False)
                if not info:
                    self.client.write(filename,
                                      data='',
                                      overwrite=True,
                                      encoding=self.encoding)

                try:
                    content = '\n'.join(new_items) + '\n'
                    self.client.write(filename,
                                      data=content,
                                      overwrite=False,
                                      append=True,
                                      encoding=self.encoding)
                    logger.info(f"保存成功 <= 文件名:{filename} 记录数:{len(items)}")
                    items.clear()  # 清空桶
                except Exception as e:
                    logger.error(f"保存失败 <= 文件名:{filename} 错误原因:{e}")
Example #25
# import
from hdfs import Client

client = Client("http://master:9870")
# client.makedirs("/abc/xyz")
x = client.list("/")
y = client.list("/", status=True)
print(y[1][0])
print(y[1][1]["accessTime"])

client.upload("/abc", "HDFSDao.py")
client.download("/abc/HDFSDao.py", "d:/ttt.py")

print("end___")
Example #26
 def __init__(self, host, port=50070):
     self.url = "http://%s:%d" % (host, port)
     self.client = Client(url=self.url)
Example #27
class RF_HDFS(object):
    def __init__(self):
        self.client = None
        self.directory = None

    def connect_and_login(self, **kwargs):
        import requests

        host = None
        port = None
        user = None
        password = None
        root = None
        timeout = None
        proxy = None

        if 'host' in kwargs:
            host = kwargs['host']
        if 'port' in kwargs:
            port = kwargs['port']
        if 'kdc' in kwargs:
            kdc = kwargs['kdc']
        if 'user' in kwargs:
            user = kwargs['user']
        if 'password' in kwargs:
            password = kwargs['password']
        if 'root' in kwargs:
            root = kwargs['root']
        if 'proxy' in kwargs:
            proxy = kwargs['proxy']
        if 'timeout' in kwargs:
            timeout = kwargs['timeout']

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://',  adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection':'Keep-Alive'})

        self.connectionStatus = False
        try:
            timeout = int(timeout)
            url = "http://" + host + ":" + str(port)

            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr != None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)

            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        return self.connectionStatus

    def list_dir(self, directory):
        output = []
        try:
            if directory != None:
                output = self.client.list(directory, status=True)
            else:
                output = self.client.list(self.client.root, status=True)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def list_names(self, directory):
        output = []
        try:
            if directory != None:
                output = self.client.list(directory, status=False)
            else:
                output = self.client.list(self.client.root, status=False)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        output = None
        try:
            output = self.client.upload(remote_path, local_path, overwrite, permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace after
            # the error message, so split on '\n' and only return the first line.
            error = str(hdfsError).splitlines()[0]
            raise HdfsLibraryError(error)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def download(self, remote_path, local_path, overwrite=False):
        output = None
        try:
            output = self.client.download(remote_path, local_path, overwrite)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def mkdir(self, directory, permission):
        try:
            # no return value
            self.client.makedirs(directory, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rmdir(self, directory):
        try:
            # no return value
            if self.client.delete(directory, recursive=True) == False:
                raise HdfsLibraryError("Directory does not exist: %r", directory)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rename(self, src_file, dst_file):
        try:
            # no return value
            self.client.rename(src_file, dst_file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def delete(self, file):
        try:
            # no return value
            if self.client.delete(file) == False:
                raise HdfsLibraryError("File does not exist: %r", file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_time(self, file, mod_time):
        try:
            # no return value
            self.client.set_times(file, -1, mod_time)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_owner(self, file, owner, group):
        try:
            # no return value
            self.client.set_owner(file, owner=owner, group=group)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_permission(self, file, permission):
        try:
            # no return value
            self.client.set_permission(file, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_acl(self, file, aclspec):
        try:
            # no return value
            self.client.set_acl(file, aclspec=aclspec)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def status(self, path):
        output = ''
        try:
            output = self.client.status(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def checksum(self, path):
        output = ''
        try:
            output = self.client.checksum(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def close(self):
        self.session.close()
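
A hypothetical usage sketch for the RF_HDFS wrapper above; the host, port, KDC and credentials are placeholder values, not taken from the original code:

rf = RF_HDFS()
# authenticate against WebHDFS and obtain a connected hdfs Client
rf.connect_and_login(host='namenode', port=50070, kdc='kdc.example.com',
                     user='hdfs', password='secret', root='/', timeout=30)
print(rf.list_names('/tmp'))   # plain names
print(rf.list_dir('/tmp'))     # (name, status) pairs
rf.close()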
Example #28
 def __init__(self):
     self.client = Client("http://fantome:50070")
Example #29
import pandas as pd
import os
from hdfs import Client
# Current approach to reading an HDFS file:
# 1. read the raw byte stream from HDFS
# 2. save the bytes locally as a .csv file
# 3. load the csv with pandas
HDFSHOST = "http://172.16.18.112:50070"
train_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19761"
test_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19762"
train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
client = Client(HDFSHOST)
with client.read(train_FILENAME) as tr_s:
    tr_content = tr_s.read()
    tr_s = str(tr_content, 'utf-8')

# write the csv locally and make sure it is flushed to disk before reading it back
tr_file = open("trainData.csv", "w")
tr_file.write(tr_s)
tr_file.flush()
os.fsync(tr_file.fileno())
tr_file.close()

# read the file back with pandas
df_train = pd.read_csv("trainData.csv", header=0)
print(df_train)

with client.read(test_FILENAME) as te_fs:
    te_content = te_fs.read()
    te_s = str(te_content, 'utf-8')
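
The intermediate .csv file is not strictly required: pandas can parse the downloaded bytes directly from memory. A minimal alternative sketch, reusing the HDFSHOST and file path variables defined above:

import io

import pandas as pd
from hdfs import Client

client = Client(HDFSHOST)
with client.read(train_FILENAME) as reader:
    # wrap the raw bytes in an in-memory buffer so pandas can parse them without a temp file
    df_train = pd.read_csv(io.BytesIO(reader.read()), header=0)
print(df_train)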
Example #30
            print(rowkey)
            mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr), \
                         Mutation(column=self.columnFamily + ":visitTime", value=visitTime), \
                         Mutation(column=self.columnFamily + ":user_id", value=user_id), \
                         Mutation(column=self.columnFamily + ":link", value=link)
                         ]
            # submit multiple rows in one batch
            mutations_batch.append(
                BatchMutation(row=rowkey, mutations=mutations))
            if len(mutations_batch) % batch_size == 0:
                self.client.mutateRows(self.tablename, mutations_batch)
                mutations_batch = []


if __name__ == "__main__":

    # set up the HBase connection
    hbasewriteer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbasewriteer.createTable()

    # connect to HDFS
    client = Client(HDFSNN)

    # list the log files
    logFiles = client.list(LOGPATH)

    # read each log file and import it into HBase
    for logfile in logFiles:
        with client.read(os.path.join(LOGPATH, logfile)) as deal_file_handle:
            hbasewriteer.importData(deal_file_handle)
Example #31
class ChatBotModel(object):
    def __init__(self,
                 hadoop_url,
                 hdfs_index_file,
                 local_index_file,
                 corpus_dir,
                 unk_answer='',
                 max_answer_len=1024):
        self.hadoop_url = hadoop_url
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir
        self.max_answer_len = max_answer_len
        self.unk_answer = unk_answer
        self.client = None
        self.inverted_index = {}

    def build_connection(self):
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        with open(self.local_index_file, 'r', encoding='utf-8') as f:
            for line in f:
                word, *querys = line.strip().split('\t')
                for query in querys:
                    file_name, query_id, score = query.split(':')
                    if word in self.inverted_index:
                        self.inverted_index[word].append(
                            [file_name, int(query_id),
                             float(score)])
                    else:
                        self.inverted_index[word] = []
                        self.inverted_index[word].append(
                            [file_name, int(query_id),
                             float(score)])

    def prepare(self):
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        file_path = os.path.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path,
                              offset=query_id,
                              length=self.max_answer_len,
                              encoding='utf-8') as f:
            answer = f.read().strip().split('\n')[0]
            return answer

    def predict_answer(self, query):
        words = jieba.lcut_for_search(query)
        querys = {}
        for word in words:
            if word not in self.inverted_index:
                continue
            for file_name, query_id, score in self.inverted_index[word]:
                query = (file_name, query_id)
                if query in querys:
                    querys[query] += score
                else:
                    querys[query] = score
        if len(querys) == 0:
            return self.unk_answer
        best_query = max(querys.items(), key=lambda x: x[1])
        (best_file_name, best_query_id), best_score = best_query
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            best_answer = self.unk_answer
        return best_answer
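
A hypothetical usage sketch for the ChatBotModel class above; the namenode URL and the index/corpus paths are placeholders:

bot = ChatBotModel(
    hadoop_url='http://namenode:50070',             # placeholder WebHDFS endpoint
    hdfs_index_file='/chatbot/inverted_index.txt',  # placeholder index file on HDFS
    local_index_file='inverted_index.txt',          # local copy of the index
    corpus_dir='/chatbot/corpus')                   # placeholder corpus directory on HDFS
bot.prepare()                                       # connect, download the index, load it into memory
print(bot.predict_answer('hello'))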
Example #32
def train(train_path,
          test_path,
          output_path,
          target,
          train_split_ratio=0.33,
          penalty='l2',
          dual=False,
          tol=1e-4,
          C=1.0,
          random_state=None,
          multi_class='ovr'):
    # record the start time
    time_trains_start = time.strftime('%Y{y}%m{m}%d{d} %H{h}%M{f}%S{s}'.format(
        y='/', m='/', d='', h=':', f=':', s=''))
    start_time = time.time()

    # input file paths
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
    test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
    client = Client(HDFS_HOSTS1)
    # read the training data
    with client.read(train_FILENAME) as tr_s:
        tr_content = tr_s.read()
        tr_s = str(tr_content, 'utf-8')
    # write the csv locally and make sure it is flushed to disk before reading it back
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
        te_s = str(te_content, 'utf-8')
    # write the csv locally and make sure it is flushed to disk before reading it back
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    test_data_num = df_test.shape[0]
    train_data_num = df_train.shape[0]

    # preprocess the prediction set
    df_test = min_max_scaler.fit_transform(df_test)
    df_test = np.array(df_test)

    # data processing and cleaning
    cols = [tmp_i for tmp_i in df_train.columns if tmp_i not in [target]]
    X = df_train[cols]

    X = np.array(X)
    X = min_max_scaler.fit_transform(X)
    Y = df_train[target]
    Y = np.array(Y)

    # split the training set
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=train_split_ratio)

    # train with scikit-learn's LogisticRegression (hyper-parameters passed by keyword)
    clf = LogisticRegression(penalty=penalty,
                             dual=dual,
                             tol=tol,
                             C=C,
                             random_state=random_state,
                             multi_class=multi_class,
                             solver='liblinear')
    clf.fit(X_train, Y_train)

    # accuracy (train_acc)
    train_acc = clf.score(X_test, Y_test)
    print('score Scikit learn: ', train_acc)
    # precision (train_precision_score)
    train_precision_score = precision_score(Y_test, clf.predict(X_test))
    # recall (train_recall_score)
    train_recall_score = recall_score(Y_test, clf.predict(X_test))
    # F1_Score
    train_f1_score = f1_score(Y_test, clf.predict(X_test))
    # roc_auc_score
    train_roc_auc_score1 = roc_auc_score(Y_test, clf.predict(X_test))

    # predict on the test set with the trained LR model
    result = clf.predict(df_test)
    # print(result)

    # record the end time and compute the total duration
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)

    # ---------------------------------------- save training results ----------------------------------------#
    ## save the model summary report
    # abstract_path = HDFS_HOSTS1 + output_path + '/abstract/data/'
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % LogisticRegression
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # if len(client.list(abstract_path)):
    # 	client.delete(abstract_path + 'abstract.csv')
    # 	client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # else:
    # 	client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    ## save the model version info csv
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    ## save the training evaluation metrics report
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # save the test-set prediction results
    file_csv_path = output_path + '/result/data/'

    # the dict keys become the csv column names
    dataframe = pd.DataFrame({target: result})
    # write the DataFrame to csv; index controls whether row labels are written (default True)
    dataframe.to_csv("result.csv", index=False, sep=',')

    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
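
A hypothetical call to the train() function above; the HDFS paths and the target column name are placeholders, and HDFS_HOSTS1 must already point at a reachable WebHDFS endpoint:

train(train_path='/demo/train',
      test_path='/demo/test',
      output_path='/demo/output',
      target='label',
      train_split_ratio=0.33)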
Example #33
    def conn(self):
        client = Client('http://192.168.0.107:11070')

        return client