Example #1
def download_parquet_from_hdfs_dir(parquet_dir,
                                   local_dir,
                                   hdfs_ip,
                                   hdfs_port=50070):
    """
    从hdfs批量下载parquet文件到local_path
    :param parquet_dir: parquet文件所在的文件'/data/a.parquet'
    :param local_path: '/data_gen/b.parquet'
    :param hdfs_ip:
    :param hdfs_port:
    :return:
    """
    import os
    from hdfs.client import Client
    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if p.endswith('.parquet'):
            print(f'downloading {os.path.join(parquet_dir, p)}')
            with client.read(os.path.join(parquet_dir, p)) as reader:
                data = reader.read()
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            with open(os.path.join(local_dir, p), 'wb') as f:
                f.write(data)
Example #2
    def get(self, request):
        _hdfsName = request.GET.get("hdfsName",
                                    "46eccfa2-1c56-11e8-a752-1008b1983d21")
        _hdfsPath = os.path.join("/datahoop/", _hdfsName)
        # print(_hdfsPath)

        try:
            # Connect to HDFS and read the file
            cli = Client(settings.HDFS_HOST)
            fileName = cli.list(_hdfsPath)[1]
            # print("filename:", fileName)
            _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
            # print(_hdfsPath)
            try:
                with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                    datas = f.read()
            except UnicodeDecodeError:
                with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                    datas = f.read()
            # Normalize line endings and convert the text into a list of rows
            datas = re.sub("\r\n", "\n", datas)
            logger.debug(datas)
            datas = datas.strip('"').split('\n')
            content = []
            for i in datas:
                content.append(i.strip('"').split(","))
        except HdfsError:
            return Response(data={"error": "文件未找到或文件编码格式不符合"},
                            status=status.HTTP_400_BAD_REQUEST)

        return Response(data={"data": content}, status=status.HTTP_200_OK)
Example #3
def get_data(file_path):
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as reader:
        data = [line.strip().split() for line in reader]
        print("data",data[0:2])
    return data
Example #4
    def get(self, request):
        _hdfsName = request.GET.get("hdfsName",
                                    "46eccfa2-1c56-11e8-a752-1008b1983d21")
        _hdfsPath = os.path.join("/datahoop/", _hdfsName)
        obj = DataSource.objects.get(format_filename=_hdfsName)
        # print(_hdfsPath)
        try:
            # Connect to HDFS and read the file
            cli = Client(settings.HDFS_HOST)
            try:
                with cli.read(_hdfsPath, encoding="gbk") as f:
                    datas = f.read()
            except UnicodeDecodeError:
                with cli.read(_hdfsPath, encoding="utf8") as f:
                    datas = f.read()
        except HdfsError:
            return Response(data={"error": "文件未找到或文件编码格式不符合"},
                            status=status.HTTP_400_BAD_REQUEST)

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={0}'.format(
            obj.file_name)
        response.write(datas)

        return response
Example #5
def test_hdfs_files():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    root_dirs = hdfs.list('/')
    assert 'spark' in root_dirs

    spark_dirs = hdfs.list('/spark')
    assert 'spark-1.4.1-bin-hadoop2.6.tgz' in spark_dirs
Example #6
def make_directory(hdfs_address, directory_path, directory_name):
    '''
    Description: Create a directory in HDFS.
    Parameters: - hdfs_address: Hadoop master node address
                - directory_path: the path under which the user wants to create the directory
                - directory_name: the directory name
    Returns: None
    '''
    client = Client('http://' + hdfs_address)
    client.makedirs(directory_path + directory_name)
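
Because the helper concatenates directory_path + directory_name with no separator, the path argument should end with a slash. A usage sketch with placeholder values:

# Placeholder address; note the trailing slash on directory_path.
make_directory('10.0.0.1:50070', '/user/demo/', 'staging')
# Equivalent to client.makedirs('/user/demo/staging')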
Example #7
def dataframe_write_to_hdfs(hdfs_path, dataframe):
    """
    :param client:
    :param hdfs_path:
    :param dataframe:
    :return:
    """
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')    
    client.write(hdfs_path, dataframe.to_csv(header=False,index=False,sep="\t"), encoding='utf-8',overwrite=True)
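
A short usage sketch for the writer above, assuming a pandas DataFrame is at hand; the column names and target path are illustrative:

import pandas as pd

# Illustrative data; the helper serializes it as tab-separated text with no header or index.
df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})
dataframe_write_to_hdfs('/tmp/demo.tsv', df)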
Example #8
def test_hdfs_files():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    root_dirs = hdfs.list('/')
    assert 'spark' in root_dirs

    spark_dirs = hdfs.list('/spark')
    assert 'spark-1.4.1-bin-hadoop2.6.tgz' in spark_dirs
Example #9
def put_to_hdfs(result_file):
    client = Client("http://192.168.53.30:50070")
    # Remove any stale copy before uploading
    if client.status('/tmp/result.csv', strict=False):
        client.delete('/tmp/result.csv')
    client.upload('/tmp', result_file)
Example #10
def test_hdfs_dirs():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    users_dirs = hdfs.list('/user')
    assert 'hive' in users_dirs
    assert 'impala' in users_dirs

    users_dirs = hdfs.list('/user/hive')
    assert 'warehouse' in users_dirs
Example #11
def get_data_hdfs(file_path):
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path,
                     buffer_size=1024,
                     delimiter='\n',
                     encoding='utf-8') as reader:
        data = [line.strip().split(',') for line in reader]
        print("data", data[0:5])
    df = pd.DataFrame(data[1:], columns=data[0])
    return df
Example #12
def test_hdfs_dirs():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    users_dirs = hdfs.list('/user')
    assert 'hive' in users_dirs
    assert 'impala' in users_dirs

    users_dirs = hdfs.list('/user/hive')
    assert 'warehouse' in users_dirs
Example #13
def save_page_hdfs(ipPort, file_path, contents):
    """Save a web page's source to HDFS

    :param ipPort: HDFS connection address
    :param file_path: file path
    :param contents: page content
    :return: None
    """
    client = Client(ipPort)
    with client.write(file_path) as writer:
        writer.write(bytes(contents, encoding='utf8'))
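
A quick sketch of how save_page_hdfs might be called; the address, path, and page content below are placeholders:

# Hypothetical values for illustration only.
save_page_hdfs('http://namenode.example.com:50070',
               '/crawler/pages/example.html',
               '<html><body>hello</body></html>')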
Example #14
 def hdfs_file2points(path):
     client = Client(QuerierParallel.master_hdfs_path,
                     root="/",
                     timeout=100,
                     session=False)
     points = []
     with client.read(path) as f:
         for line in f:
             info = line.strip('\n').split('\t')
             points.append([float(info[0]), float(info[1])])
     f.close()
     return points
Example #15
def run_hdfs_test(conf: ConfigData):
    # the_date = conf.test_date()  # "20181101"
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    # root_path = conf.unzip_dir(is_baoli)     # 'D:/DATA/UNZIP/'
    # dest_dir = conf.hdfs_dir_syb(is_baoli)

    # file_pre = conf.file_pre1()  # "t1_trxrecord_"
    # file_ext = conf.file_ext2()  # "_V2.csv"

    #    client.upload('/shouyinbao/', "/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv", cleanup=True)
    dat = client.list('/', status=False)
    print(dat)
Example #16
def do():
    global csv_path
    client = Client(hdfshost)
    file_list = client.list(csv_path)
    print(file_list)
    for file in file_list:
        if file.endswith(".csv"):
            csv_path = csv_path + file
    # Read the CSV from HDFS and write a local copy
    with open("./异常临界值local.csv", 'w', encoding='GB2312') as local:
        with client.read(csv_path, encoding='GB2312') as hdfs:
            for line in hdfs:
                local.write(line.strip('\n'))
Example #17
def test_hdfs_dirs():
    project = utils.get_test_project()
    nn_ip = project.cluster.head.ip

    hdfs = Client("http://%s:50070" % nn_ip)
    assert hdfs

    root_dirs = hdfs.list("/")
    assert "tmp" in root_dirs
    assert "user" in root_dirs

    users_dirs = hdfs.list("/user")
    assert project.settings["USERNAME"] in users_dirs
Example #18
def test_hdfs_dirs():
    project = utils.get_test_project()
    nn_ip = project.cluster.head.ip

    hdfs = Client('http://%s:50070' % nn_ip)
    assert hdfs

    root_dirs = hdfs.list('/')
    assert 'tmp' in root_dirs
    assert 'user' in root_dirs

    users_dirs = hdfs.list('/user')
    assert project.settings['USERNAME'] in users_dirs
Example #19
 def read_accesslog_from_hdfs(self):
     # The real-time log stream is persisted once every 5 click records
     client = Client("http://localhost:50070")
     file_names = client.list("/hadoop_file")
     ss = ""
     for file_name in file_names:
         with client.read("/hadoop_file/" + file_name,
                          encoding="utf-8") as reader:
             for line in reader:
                 # Skip test data
                 if line.startswith("filed1"):
                     continue
                 ss += line
Example #20
 def generate_files(
     date,
     path="user/hadoop/trajectory/sim_trajectory_per_day/shanghai/%s-%s/%s",
 ):
     year, month, day = date.split('-')
     if date in QuerierParallel.files:
         return
     else:
         client = Client(QuerierParallel.master_hdfs_path,
                         root="/",
                         timeout=100,
                         session=False)
         QuerierParallel.files.update(
             {date: client.list(path % (year, month, day))})
Example #21
def read(dir_path, header):
    client = Client("http://127.0.0.1:50070")
    log_data = []
    for date_dir in client.list(dir_path):
        for log_file in client.list(dir_path+'/'+date_dir):
            with client.read(dir_path+'/'+date_dir+'/'+log_file) as fs:
                for line in fs:
                    row = line.strip().split('&')
                    if row != ['']:
                        tmp = []
                        for field in row:
                            tmp.append(field.split('=')[1])
                        log_data.append(tmp)
    return pd.DataFrame(log_data, columns=header)
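
The reader above might be invoked like this, assuming each log line is a string of &-separated key=value pairs; the directory layout and column names are only illustrative:

# Hypothetical layout: /logs/<date>/<file>, lines like "ts=...&uid=...&url=..."
df = read('/logs', header=['ts', 'uid', 'url'])
print(df.head())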
Example #22
def mv_local_to_hdfs():
    '''
    Move the generated files to HDFS
    '''
    now_time=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
    file_index=int(now_time[11:13])
    if file_index==0:
        file_path_all=getYesterday()
    else:
        file_path_all=now_time[0:10]
    client=Client("http://master:50070")
    if file_path_all not in client.list('/traffFile'):
        os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -mkdir /traffFile/'+file_path_all)
    local_path='/usr/local/bro/spool/worker-1/extract_files/*'
    os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -put '+local_path+' /traffFile/'+file_path_all)
Example #23
def mv_local_to_hdfs(filename):
    '''
    Move the generated file to HDFS
    '''
    now_time=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
    file_index=int(now_time[11:13])
    if file_index==0:
        file_path_all=getYesterday()
    else:
        file_path_all=now_time[0:10]
    client=Client("http://master:50070")
    if file_path_all not in client.list('/traffLog'):
        os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -mkdir /traffLog/'+file_path_all)
    local_path=get_path_or_buf(filename)
    os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -put '+local_path+' /traffLog/'+file_path_all)
Example #24
def read_corpus():
    qList = []
    # keyword lists for the questions
    qList_kw = []
    aList = []
    lines = []
    client = Client("http://localhost:50070")
    with client.read("/corpus/q_a.csv", encoding='utf-8') as reader:
        for line in reader:
            lines.append(line.strip())
    for t in lines:
        # each line is expected to hold a "question,answer" pair
        q, a = t.split(',', 1)
        qList.append(q)
        qList_kw.append(seg.cut(q))
        aList.append(a)
    return qList_kw, qList, aList
Example #25
def main():
  """Entry point."""
  # arguments parsing first for quicker feedback on invalid arguments
  args = docopt(__doc__, version=__version__)
  # set up logging
  logger = lg.getLogger('hdfs')
  logger.setLevel(lg.DEBUG)
  handler = Config().get_file_handler('hdfs')
  if handler:
    logger.addHandler(handler)
  # set up client and fix arguments
  client = Client.from_alias(args['--alias'])
  rpath = args['RPATH'] or '.'
  for option in ('--depth', '--threads'):
    try:
      args[option] = int(args[option])
    except ValueError:
      raise HdfsError('Invalid `%s` option: %r.', option, args[option])
  # run command
  if args['--log']:
    if handler:
      sys.stdout.write('%s\n' % (handler.baseFilename, ))
    else:
      raise HdfsError('No log file active.')
  elif args['--write']:
    reader = (line for line in sys.stdin) # doesn't work with stdin, why?
    client.write(rpath, reader, overwrite=args['--overwrite'])
  elif args['--read']:
    size = client.status(rpath)['length']
    read(client.read(rpath), size, '%s\t' % (rpath, ))
  elif args['--download']:
    client.download(
      rpath,
      args['LPATH'],
      overwrite=args['--overwrite'],
      n_threads=args['--threads']
    )
  elif args['--upload']:
    client.upload(
      rpath,
      args['LPATH'],
      overwrite=args['--overwrite'],
      n_threads=args['--threads']
    )
  elif args['--list']:
    list_infos(client, rpath, args['--depth'], args['--json'], args['--path'])
  else:
    banner = (
      'Interactive HDFS python shell.\n'
      'Client for %r is available as `CLIENT`.'
      % (client.url, )
    )
    namespace = {'CLIENT': client}
    try:
      from IPython import embed
    except ImportError:
      from code import interact
      interact(banner=banner, local=namespace)
    else:
      embed(banner1=banner, user_ns=namespace)
Example #26
def run_hive(conf: ConfigData, the_date: str):
    a_client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(),
                   port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(),
                   user=conf.hive_user())
    cur = conn.cursor()

    print("Start\n")

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    # hdfs_dir_bl
    root_path = str(
        pathlib.PurePosixPath(conf.get_hdfs_path()).joinpath(the_date))
    file_name = str(
        pathlib.PurePosixPath(root_path).joinpath(
            conf.get_file_name(the_date)))
    # "/data/posflow/allinpay_utf8_zc/20181101/"
    # 20181101_loginfo_rsp_bl_new.csv
    # 20181101_rsp_agt_bl_new.del
    # 20181101_rxinfo_rsp_bl.txt

    table_name = conf.get_table_name()

    if MyHdfsFile.isfile(a_client, file_name):
        sql = 'LOAD DATA INPATH \'' + file_name + '\' INTO TABLE ' + table_name  # 'test.t1_trxrecprd_v2_zc'
        # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
        print("OK" + "  " + sql + "\n")
        cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
Example #27
 def generate_temp_files(need_certificate=NEED_CERTIFICATE):
     if need_certificate:
         with krbcontext(using_keytab=True,
                         keytab_file=KEYTAB_PATH,
                         principal=PRINCIPAL):
             for node in HDFS.NODES:
                 try:
                     hdfs_client = KerberosClient(node)
                     hdfs_client.download(HDFS.REMOTE_PATH,
                                          HDFS.LOCAL_PATH,
                                          n_threads=HDFS.THREAD_NUM)
                 except Exception as err:
                     logging.info(err)
                 else:
                     return
             logging.error("Failed to download remote HDFS file.")
             raise Exception("Failed to download remote HDFS file.")
     else:
         for node in HDFS.NODES:
             try:
                 hdfs_client = Client(node)
                 hdfs_client.download(HDFS.REMOTE_PATH,
                                      HDFS.LOCAL_PATH,
                                      n_threads=HDFS.THREAD_NUM)
             except Exception as err:
                 logging.info(err)
             else:
                 return
         logging.error("Failed to download remote HDFS file.")
         raise Exception("Failed to download remote HDFS file.")
Example #28
def get_client(host, use_kerberos):
    if use_kerberos:
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(host)
    else:
        from hdfs.client import Client
        return Client(host)
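
Example #28's factory can be exercised as below; the WebHDFS URL is a placeholder, and whether Kerberos is required depends on the cluster:

# Placeholder URL; set use_kerberos=True on a secured cluster.
client = get_client('http://namenode.example.com:50070', use_kerberos=False)
print(client.list('/'))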
Example #29
def run_hive(conf: ConfigData, the_date: str):
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(), port=conf.hive_port(), auth_mechanism=conf.hive_auth(), user=conf.hive_user())
    cur = conn.cursor()

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    root_path = conf.get_data("hdfs_dir_zc")  # "/data/posflow/allinpay_utf8_zc/"
    file_ext3 = conf.get_data("file_ext3")  # _loginfo_rsp.txt          # 20181101_loginfo_rsp.txt
    file_ext4 = conf.get_data("file_ext4")  # _loginfo_rsp_agt.txt      # 20181101_loginfo_rsp_agt.txt
    file_ext5 = conf.get_data("file_ext5")  # _rxinfo_rsp.txt           # 20181101_rxinfo_rsp.txt
    file_ext6 = conf.get_data("file_ext6")  # _rxinfo_rsp_agt.txt       # 20181101_rxinfo_rsp_agt.txt

    print("Start\n")

    file3 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext3))
    file4 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext4))
    file5 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext5))
    file6 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext6))

    f_list = [file3,file4,file5,file6]
    t_list = ["hive_table3", "hive_table4", "hive_table5", "hive_table6"]

    for n in range(0,4):
        if MyHdfsFile.isfile(client, f_list[n]):
            sql = 'LOAD DATA INPATH \'' + f_list[n] + '\' INTO TABLE ' + conf.get_data(t_list[n])  # 'test.t1_trxrecprd_v2_zc'
            # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
            print("OK" + "  " + sql+"\n")
            cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
Example #30
 def HDFS_cd(self, hdfs_path):
     """ Change the current directory; in effect this simply reconnects """
     m_NewDirectory = Path(os.path.join(self.__m_HDFS_WebFSDir__,
                                        hdfs_path)).as_posix()
     self.__m_HDFS_WebFSDir__ = m_NewDirectory
     self.__m_HDFS_Handler__ = Client(self.__m_HDFS_WebFSURL__,
                                      self.__m_HDFS_WebFSDir__,
                                      session=None)
Example #31
 def _get_client(self, addr, port):
     if not CACHE or not self._clients.has_key(addr):
         cli = Client('http://%s:%s' % (str(addr), str(port)))
         if CACHE:
             self._clients.update({addr: cli})
     else:
         cli = self._clients.get(addr)
     return cli
Example #32
 def connect(self, host, port):
     conn_url = "http://{}:{}".format(host, port)
     try:
         self.client = Client(conn_url)
         return True
     except Exception, e:
         print "Connect Failed:{}-{}".format(Exception, e)
         return False
Example #33
 def safe_make_dir(client: Client, to_file: str):
     p = pathlib.PurePosixPath(to_file)  # pathlib.Path(to_file).parents
     if len(p.parts) >= 2:  # type(p) == pathlib._PathParents and
         the_path = p.parts[0]
         for i in range(1, len(p.parts) - 1):
             the_path = pathlib.PurePosixPath(the_path).joinpath(pathlib.PurePosixPath(p.parts[i]))
             the_path_str = str(the_path)
             # os.path.join(the_path, p.parts[i])
             the_dir = client.status(the_path_str, strict=False)
             if the_dir is None:
                 client.makedirs(the_path_str, permission=777)
             #                client.set_owner(thePath,owner='hdfs',group='supergroup')
             else:
                 if the_dir['type'].lower() == 'directory':
                     pass
                 else:
                     return
     return
Example #34
def download_parquet_from_hdfs(parquet_path,
                               local_path,
                               hdfs_ip,
                               hdfs_port=50070):
    """
    从hdfs下载parquet文件到local_path
    :param parquet_path: '/data/a.parquet'
    :param local_path: '/data_gen/b.parquet'
    :param hdfs_ip:
    :param hdfs_port:
    :return:
    """
    from hdfs.client import Client
    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    with client.read(parquet_path) as reader:
        data = reader.read()
    with open(local_path, 'wb') as f:
        f.write(data)
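
Example #34 buffers the entire file in memory before writing it out. For large files, the library's client.download() (also used in Examples #25 and #27 above) streams to disk instead; a minimal sketch under the same placeholder arguments:

def download_parquet_from_hdfs_streaming(parquet_path, local_path, hdfs_ip, hdfs_port=50070):
    # Same parameters as above; client.download writes directly to local_path
    # without holding the whole file in memory.
    from hdfs.client import Client
    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    client.download(parquet_path, local_path, overwrite=True)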
Example #35
 def test_autoload_client_from_path(self):
   with temppath() as module_path:
     self._write_client_module(module_path, 'PathClient')
     with temppath() as config_path:
       config = Config(config_path)
       config.add_section(config.global_section)
       config.set(config.global_section, 'autoload.paths', module_path)
       config._autoload()
       client = Client.from_options({'url': ''}, 'PathClient')
       eq_(client.one, 1)
Example #36
 def is_exist(client: Client, path: str, f_type: int=3):  # 1 file , 2 dir, 3 any
     the_dir = client.status(path, strict=False)
     if the_dir is None:
         return False
     else:
         if (f_type & 1) > 0 and the_dir['type'].lower() == 'file':
             return True
         if (f_type & 2) > 0 and the_dir['type'].lower() == 'directory':
             return True
     return False
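
Treating is_exist as a standalone helper, it can guard a write like this; the address and path are placeholders:

from hdfs.client import Client

# Placeholder connection and path; only write if nothing exists there yet.
client = Client('http://namenode.example.com:50070')
if not is_exist(client, '/data/out/result.csv', f_type=3):
    client.write('/data/out/result.csv', data='id,value\n', encoding='utf-8')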
Example #37
 def test_autoload_client_from_module(self):
   with temppath() as module_dpath:
     os.mkdir(module_dpath)
     sys.path.append(module_dpath)
     module_fpath = osp.join(module_dpath, 'mclient.py')
     self._write_client_module(module_fpath, 'ModuleClient')
     try:
       with temppath() as config_path:
         config = Config(config_path)
         config.add_section(config.global_section)
         config.set(config.global_section, 'autoload.modules', 'mclient')
         config._autoload()
         client = Client.from_options({'url': ''}, 'ModuleClient')
         eq_(client.one, 1)
     finally:
       sys.path.remove(module_dpath)
Example #38
def main():
  """Entry point."""
  # arguments parsing first for quicker feedback on invalid arguments
  args = docopt(__doc__, version=__version__)
  # set up logging
  logger = lg.getLogger('hdfs')
  logger.setLevel(lg.DEBUG)
  handler = Config().get_file_handler('hdfs')
  if handler:
    logger.addHandler(handler)
  # set up client and fix arguments
  client = Client.from_alias(args['--alias'])
  rpath = args['RPATH'] or ''
  for option in ('--depth', '--threads'):
    try:
      args[option] = int(args[option])
    except ValueError:
      raise HdfsError('Invalid `%s` option: %r.', option, args[option])
  # run command
  if args['--log']:
    if handler:
      sys.stdout.write('%s\n' % (handler.baseFilename, ))
    else:
      raise HdfsError('No log file active.')
  elif args['--write']:
    reader = (line for line in sys.stdin) # doesn't work with stdin, why?
    client.write(rpath, reader, overwrite=args['--overwrite'])
  elif args['--read']:
    size = client.status(rpath)['length']
    read(client.read(rpath), size, '%s\t' % (rpath, ))
  elif args['--download']:
    client.download(
      rpath,
      args['LPATH'],
      overwrite=args['--overwrite'],
      n_threads=args['--threads']
    )
  else:
    infos(client, rpath, args['--depth'], args['--json'], args['--path'])