Example 1
def hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
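The generator above (and the similar ones in the next few examples) is written in the style of a test fixture: set up a clean /tmp/test directory, hand the connected client to the caller, and tear the directory down afterwards. A minimal sketch of how such a generator is typically wired up, assuming pytest and a NameNode reachable at localhost:8020 (both assumptions, not part of the original snippet):

import pytest
from hdfs3 import HDFileSystem

@pytest.fixture
def hdfs():
    # Set up: connect and start from an empty /tmp/test directory.
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')
    yield hdfs
    # Tear down: remove the test directory after the test has run.
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')

def test_touch_and_ls(hdfs):
    hdfs.touch('/tmp/test/a.txt')
    assert '/tmp/test/a.txt' in hdfs.ls('/tmp/test', detail=False)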
Example 2
def hdfs():
    hdfs = HDFileSystem(host="localhost", port=8020)
    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
    hdfs.mkdir("/tmp/test")

    yield hdfs

    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
Example 3
def make_hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    try:
        yield hdfs
    finally:
        if hdfs.exists('/tmp/test'):
            hdfs.rm('/tmp/test')
Example 4
def hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020,
                        pars={'rpc.client.connect.retry': '2'})
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.disconnect()
Example 5
def hdfs():
    hdfs = HDFileSystem(host=test_host,
                        port=test_port,
                        pars={'rpc.client.connect.retry': '2'})
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test', recursive=True)
    hdfs.disconnect()
Example 6
def make_hdfs():
    from hdfs3 import HDFileSystem
    # from .hdfs import DaskHDFileSystem
    basedir = '/tmp/test-distributed'
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists(basedir):
        hdfs.rm(basedir)
    hdfs.mkdir(basedir)

    try:
        yield hdfs, basedir
    finally:
        if hdfs.exists(basedir):
            hdfs.rm(basedir)
Example 7
def _main_(self):
    # client = Client(url, root=None, proxy=None, timeout=None, session=None)
    # client = Client("http://hadoop:50070")
    # client = InsecureClient("http://10.0.75.1:50070/", user='******')
    # # client = InsecureClient("http://120.78.186.82:50070", user='******');
    hdfs = HDFileSystem(host="10.0.75.1", port=8020)  # port must be an int
    path = "/data/"
    hdfs.cancel_token(token=None)  # cancel a delegation token (purpose unclear in the original notes)
    hdfs.cat(path)  # return the contents of the given file or directory
    print(hdfs)
    print(hdfs.exists(path))
    # hdfs.chmod(path, mode)  # change the permissions of the given path
    # hdfs.chown(path, owner, group)  # change the owner and group of the given path
    # hdfs.concat(destination,
    #             paths)  # concatenate the files at the given paths into the file at destination; the source files are deleted on successful completion
    # hdfs.connect()  # connect to the NameNode; this happens automatically at startup, i.e. when HDFileSystem(host='127.0.0.1', port=8020) is constructed
    # hdfs.delegate_token(user=None)
    # hdfs.df()  # used/free disk space on the HDFS system
    # hdfs.disconnect()  # the opposite of connect(): close the connection
    # hdfs.du(path, total=False, deep=False)  # file sizes under the given path; total sums them into one number, deep recurses into subdirectories
    # hdfs.exists(path)  # whether the path exists
    # hdfs.get(hdfs_path, local_path, blocksize=65536)  # copy an HDFS file to the local filesystem; blocksize sets how much is read at a time
    # hdfs.get_block_locations(path, start=0, length=0)  # physical locations of the blocks
    # hdfs.getmerge(path, filename, blocksize=65536)  # fetch all files under the given directory and merge them into a single local file
    # hdfs.glob(path)  # e.g. /user/spark/abc-*.txt: list of paths matching the pattern
    # hdfs.head(path, size=1024)  # first part of the file at the given path
    # hdfs.info(path)  # information about the file at the given path
    # hdfs.isdir(path)  # whether the given path is a directory
    # hdfs.isfile(path)  # whether the given path is a file
    # hdfs.list_encryption_zones()  # list of all encryption zones
    # hdfs.ls(path, detail=False)  # paths under the given path; detail adds full file information
    # hdfs.makedirs(path, mode=457)  # create directories recursively, like mkdir -p
    # hdfs.mkdir(path)  # create a directory
    # hdfs.mv(path1, path2)  # move path1 to path2
    # open(path, mode='rb', replication=0, buff=0, block_size=0)  # open a file, similar to Python's built-in file objects
    # hdfs.put(filename, path, chunk=65536, replication=0, block_size=0)  # upload a local file to the given HDFS directory
    # hdfs.read_block(fn, offset, length,
    #                 delimiter=None)  # read `length` bytes of the file starting at `offset`; delimiter ensures reads start and stop on the delimiter bytestring
    # hdfs.read_block('/data/file.csv', 0, 13)
    # hdfs.read_block('/data/file.csv', 0, 13, delimiter=b'\n')
    # hdfs.rm(path, recursive=True)  # delete the given path; recursive controls recursive deletion
    # hdfs.tail(path, size=1024)  # last part of the file
    # hdfs.touch(path)  # create an empty file
    # hdfs.walk(path)  # walk the file tree
    # print(client)
    # hdfs.put_to_hdfs(client, 'F:\\Maven\\work\\CDES\\code\\LogDemo', '/data')
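A minimal runnable sketch exercising a few of the calls listed above, assuming a NameNode reachable at localhost:8020 and write access to /tmp (both assumptions):

from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='localhost', port=8020)

hdfs.mkdir('/tmp/hdfs3-demo')                       # create a directory
with hdfs.open('/tmp/hdfs3-demo/hello.txt', 'wb') as f:
    f.write(b'hello, hdfs3\n')                      # write a small file

print(hdfs.ls('/tmp/hdfs3-demo', detail=False))     # list paths in the directory
print(hdfs.cat('/tmp/hdfs3-demo/hello.txt'))        # read the whole file as bytes
print(hdfs.du('/tmp/hdfs3-demo', total=True))       # total size under the directory

hdfs.rm('/tmp/hdfs3-demo', recursive=True)          # recursive delete
hdfs.disconnect()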
Example 8
def insert_csv(self,
               table_name,
               name_node_url,
               name_node_options,
               csv_file,
               df_flag,
               autolimit,
               displaylimit=100):
    """
    Enables insertion of CSVs or DataFrames to Hive.
    """
    if self.cluster_details:
        if 'name_node_url' in self.cluster_details:
            name_node_url = self.cluster_details['name_node_url']
        if 'name_node_opts' in self.cluster_details:
            name_node_options = self.cluster_details['name_node_opts']
    csv_file_name = csv_file.split('/')[-1]
    folder_name = csv_file_name.split('.')[0]
    hdfs_location = '/user/{}/notebooks/{}'.format(getpass.getuser(),
                                                   folder_name)
    hdfs = HDFileSystem(host=name_node_url, pars=name_node_options)
    if not hdfs.exists(hdfs_location):
        hdfs.mkdir(hdfs_location)
    hdfs_file_location = '{}/{}'.format(hdfs_location, csv_file_name)
    hdfs.put(csv_file, hdfs_file_location)
    data_type = HiveConnection.csv_datatypes(csv_file)
    data_type_list = ',\n'.join(
        ['%s %s' % (key, value) for (key, value) in data_type.items()])
    create_table_command = "CREATE EXTERNAL TABLE {} ({})" \
                           "\nROW FORMAT DELIMITED\nFIELDS TERMINATED BY " \
                           "','\nSTORED AS TEXTFILE\nLOCATION '{}'\n" \
                           "tblproperties(" \
                           "\"skip.header.line.count\"=\"1\");"\
        .format(table_name, data_type_list, hdfs_location)
    drop_table_command = 'DROP TABLE IF EXISTS {};'.format(table_name)
    command = drop_table_command + '\n' + create_table_command
    if df_flag:
        os.system("rm %s" % csv_file)
    return self.execute(command, autolimit, displaylimit)
Example 9
'''
Created on Jun 10, 2017

@author: SathishParthasarathy
'''

from pyspark import SparkConf, SparkContext
from hdfs3 import HDFileSystem
if __name__ == '__main__':
    conf = SparkConf().setAppName("Word Count - Python")
    spark = SparkContext(conf=conf)
    hdfs = HDFileSystem('hadoop.master.com', port=9000)
    if hdfs.exists("/user/psathishcs/Output/Books/Science_Python") != True:
        text_file = spark.textFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Input/Books/The_Outline_of_Science.txt"
        )
        words = text_file.flatMap(lambda line: line.split())
        wordCounts = words.map(lambda word: (word, 1)).reduceByKey(
            lambda a, b: a + b)
        wordCounts.saveAsTextFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Output/Books/Science_Python"
        )
Example 10
from datetime import datetime
from hdfs3 import HDFileSystem

# Helper functions (spark_readData, org_features, build_features, lr_history_data,
# train_val_data, org_predictData, lr_predictData, ...) are defined elsewhere in the
# original script.
if __name__ == "__main__":
    todayStr1 = datetime.strftime(datetime.now(), '%Y%m%d')
    todayStr2 = datetime.strftime(datetime.now(), '%Y-%m-%d')
    hdfs = HDFileSystem(host='10.0.1.218', port=9000)
    # df_org_features = org_features(df_org, df_holidayList, df_holidayTOdate)
    df_org, df_holidayList, df_holidayTOdate = spark_readData()
    print('read data is OK!!!')
    df_orgFeatures = org_features(df_org, df_holidayList, df_holidayTOdate)
    print('df_orgFeatures is OK!!!')
    df_buildFeatures = build_features(df_orgFeatures)
    print('df_buildFeatures is OK!!!')
    lr_historyData = lr_history_data(df_buildFeatures)
    print('lr_historyData is OK!!!')
    train_data, val_data = train_val_data(lr_historyData)
    print("train_data and val_data is OK!!!")
    # mseError_DF = lr_trainModel(train_data, val_data)
    # mseError_DF.show()
    # print('LR_Model is OK!!!')
    org_predictData = org_predictData(df_buildFeatures, df_holidayList)
    print("org_predictData is OK!!!")
    lr_predictData = lr_predictData(org_predictData)
    if hdfs.exists('hdfs://10.0.1.218:9000/predict-2019/predict_data_20190624.parquet'):
        hdfs.rm('hdfs://10.0.1.218:9000/predict-2019/predict_data_20190624.parquet')
    lr_predictData.write.format('parquet').save('hdfs://10.0.1.218:9000/predict-2019/predict_data_20190624.parquet')
    print("lr_predictData is OK!!!")
Example 11
#!/bin/env python
# -*- coding: utf-8 -*-


import os.path
from hdfs3 import HDFileSystem
import config

print('NameNode host:', config.NAMENODE_HOST)
print('NameNode port:', config.NAMENODE_PORT)

client = HDFileSystem(host=config.NAMENODE_HOST, port=config.NAMENODE_PORT)
remote_dir = os.path.dirname(config.RFILE_FMT)
if not client.exists(remote_dir):
    client.mkdir(remote_dir)

for day in range(0, config.DAYS):
    src = "".join([config.LFILE_FMT, str(day)])
    dst = "".join([config.RFILE_FMT, str(day)])

    if not os.path.exists(src):
        print('Skipping:', src, 'file not found!')
        continue

    if client.exists(dst):
        print('Skipping:', dst, 'file already exists: hadoop fs -rm', dst)
        continue

    print('Uploading', src, '=>', dst)
    client.put(src, dst)
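The script reads its settings from a config module that is not shown. A plausible config.py is sketched below; the attribute names come from the script above, but every value is hypothetical:

# config.py -- hypothetical values; only the attribute names are taken from the script above.
NAMENODE_HOST = 'localhost'
NAMENODE_PORT = 8020
DAYS = 7
LFILE_FMT = '/var/log/myapp/events.log.'   # local file prefix; the day index is appended
RFILE_FMT = '/data/myapp/events.log.'      # HDFS file prefix; the day index is appended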
Example 12
class KafkaConsumer(object):

    def __init__(self, topics):
        # self.consumer = KafkaConsumer(
        #     topics,
        #     bootstrap_servers=['localhost:9092'],
        #     auto_offset_reset='earliest',
        #     enable_auto_commit=True,
        #     group_id='my-group',
        #     value_deserializer=lambda x: loads(x.decode('utf-8')))

        # self.HDFS = pa.HDFS.connect(host='localhost', port=9000)
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.destination_path = ""
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.temp_path = 'temp_data'

    def write_to_file(self, source_df, data_type="tweet"):
        """
        Write collected data to HDFS.

        If data for the same day has already been collected, the method retrieves the
        previous dataframe, combines it with the new one, and writes the result back to HDFS.

        Ref:    https://hdfs3.readthedocs.io/en/latest/

        :param source_df: Dataframe to be written
        :param data_type: "tweet", "rss", "corona" only these 3
        :return:
        """
        try:
            if len(source_df) > 0:
                if "tweet" in str(data_type).lower():
                    self.destination_path = self.dest_path_tweet
                elif "rss" in str(data_type).lower():
                    self.destination_path = self.dest_path_rss
                elif "corona" in str(data_type).lower():
                    self.destination_path = self.dest_path_corona
                else:
                    raise Exception("Invalid data type, unsure where to storage in HDFS.")

                # write to temp storage
                file_name = 'temp.csv'
                temp_path = os.path.join(self.temp_path, file_name)
                source_df.to_csv(temp_path)

                # check see if existing csv if yes combine them
                date_str = datetime.now().strftime("%d-%m-%Y")
                file_name = 'tweets_{0}.csv'.format(date_str)
                hdfs_path = os.path.join(self.destination_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    with self.hdfs.open(hdfs_path) as f:
                        exist_df = pd.read_csv(f)
                        source_df = pd.concat([source_df, exist_df])
                        self.hdfs.rm(hdfs_path)  # remove and write a new one

                # pushing to HDFS
                self.hdfs.put(temp_path, hdfs_path)

                print("Write to HDFS completed: ", source_df.shape)
        except Exception as e:
            print(str(e))
Example 13
class HadoopFileSystem(FileSystem):
  """``FileSystem`` implementation that supports HDFS.

  URL arguments to methods expect strings starting with ``hdfs://``.

  Uses client library :class:`hdfs3.core.HDFileSystem`.
  """

  def __init__(self):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__()
    self._hdfs_client = HDFileSystem()

  @classmethod
  def scheme(cls):
    return 'hdfs'

  @staticmethod
  def _parse_url(url):
    """Verifies that url begins with hdfs:// prefix, strips it and adds a
    leading /.

    Raises:
      ValueError if url doesn't begin with hdfs://.

    Args:
      url: A URL in the form hdfs://path/...

    Returns:
      For an input of 'hdfs://path/...', will return '/path/...'.
    """
    m = _URL_RE.match(url)
    if m is None:
      raise ValueError('Could not parse url: %s' % url)
    return m.group(1)

  def join(self, base_url, *paths):
    """Join two or more pathname components.

    Args:
      base_url: string path of the first component of the path.
        Must start with hdfs://.
      paths: path components to be added

    Returns:
      Full url after combining all the passed components.
    """
    basepath = self._parse_url(base_url)
    return _HDFS_PREFIX + self._join(basepath, *paths)

  def _join(self, basepath, *paths):
    return posixpath.join(basepath, *paths)

  def split(self, url):
    rel_path = self._parse_url(url)
    head, tail = posixpath.split(rel_path)
    return _HDFS_PREFIX + head, tail

  def mkdirs(self, url):
    path = self._parse_url(url)
    if self._exists(path):
      raise IOError('Path already exists: %s' % path)
    return self._mkdirs(path)

  def _mkdirs(self, path):
    self._hdfs_client.makedirs(path)

  def match(self, url_patterns, limits=None):
    if limits is None:
      limits = [None] * len(url_patterns)

    if len(url_patterns) != len(limits):
      raise BeamIOError(
          'Patterns and limits should be equal in length: %d != %d' % (
              len(url_patterns), len(limits)))

    # TODO(udim): Update client to allow batched results.
    def _match(path_pattern, limit):
      """Find all matching paths to the pattern provided."""
      file_infos = self._hdfs_client.ls(path_pattern, detail=True)[:limit]
      metadata_list = [FileMetadata(file_info['name'], file_info['size'])
                       for file_info in file_infos]
      return MatchResult(path_pattern, metadata_list)

    exceptions = {}
    result = []
    for url_pattern, limit in zip(url_patterns, limits):
      try:
        path_pattern = self._parse_url(url_pattern)
        result.append(_match(path_pattern, limit))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url_pattern] = e

    if exceptions:
      raise BeamIOError('Match operation failed', exceptions)
    return result

  def _open_hdfs(self, path, mode, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    res = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      res = CompressedFile(res)
    return res

  def create(self, url, mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
    path = self._parse_url(url)
    return self._create(path, mime_type, compression_type)

  def _create(self, path, mime_type='application/octet-stream',
              compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'wb', mime_type, compression_type)

  def open(self, url, mime_type='application/octet-stream',
           compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
    path = self._parse_url(url)
    return self._open(path, mime_type, compression_type)

  def _open(self, path, mime_type='application/octet-stream',
            compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """
    Will overwrite files and directories in destination_file_names.

    Raises ``BeamIOError`` if any error occurs.

    Args:
      source_file_names: iterable of URLs.
      destination_file_names: iterable of URLs.
    """
    if len(source_file_names) != len(destination_file_names):
      raise BeamIOError(
          'source_file_names and destination_file_names should '
          'be equal in length: %d != %d' % (
              len(source_file_names), len(destination_file_names)))

    def _copy_file(source, destination):
      with self._open(source) as f1:
        with self._create(destination) as f2:
          while True:
            buf = f1.read(_COPY_BUFFER_SIZE)
            if not buf:
              break
            f2.write(buf)

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination."""
      if not self._hdfs_client.isdir(source):
        _copy_file(source, destination)
        return

      for path, dirs, files in self._hdfs_client.walk(source):
        for dir in dirs:
          new_dir = self._join(destination, dir)
          if not self._exists(new_dir):
            self._mkdirs(new_dir)

        rel_path = posixpath.relpath(path, source)
        if rel_path == '.':
          rel_path = ''
        for file in files:
          _copy_file(self._join(path, file),
                     self._join(destination, rel_path, file))

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        _copy_path(rel_source, rel_destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Copy operation failed', exceptions)

  def rename(self, source_file_names, destination_file_names):
    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        if not self._hdfs_client.mv(rel_source, rel_destination):
          raise BeamIOError(
              'libhdfs error in renaming %s to %s' % (source, destination))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Rename operation failed', exceptions)

  def exists(self, url):
    """Checks existence of url in HDFS.

    Args:
      url: String in the form hdfs://...

    Returns:
      True if url exists as a file or directory in HDFS.
    """
    path = self._parse_url(url)
    return self._exists(path)

  def _exists(self, path):
    """Returns True if path exists as a file or directory in HDFS.

    Args:
      path: String in the form /...
    """
    return self._hdfs_client.exists(path)

  def delete(self, urls):
    exceptions = {}
    for url in urls:
      try:
        path = self._parse_url(url)
        self._hdfs_client.rm(path, recursive=True)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
Example 14
class HDFSUtil(object):
    """
    Reference: https://hdfs3.readthedocs.io/en/latest/api.html

    HDFS utility handles read, write, delete files
    """

    def __init__(self):
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.destination_path = ""
        self.import_path = '../import_data'
        self.import_path_tweet = os.path.join(self.import_path, 'tweets')
        self.import_path_rss = os.path.join(self.import_path, 'rss')
        self.import_path_corona = os.path.join(self.import_path, 'corona')

        self.hdfs_types = {'tweet': self.dest_path_tweet, 'rss': self.dest_path_rss, 'corona': self.dest_path_corona}
        self.import_types = {'tweet': self.import_path_tweet, 'rss': self.import_path_rss,
                             'corona': self.import_path_corona}
        self.temp_types = {'tweet': "TempData/temp_tweet.csv", "rss": "TempData/temp_rss.csv",
                           "corona": "TempData/temp_corona.csv"}

    def get_files(self, data_type):
        """
        Return a list of the files stored in HDFS for the given data type.

        :param data_type: one of 'tweet', 'rss', 'corona' (string)

        :return: list of file paths stored in HDFS
        """
        if "tweet" == str(data_type).lower():
            self.destination_path = self.dest_path_tweet
        elif "rss" == str(data_type).lower():
            self.destination_path = self.dest_path_rss
        elif "corona" == str(data_type).lower():
            self.destination_path = self.dest_path_corona
        else:
            raise Exception("Invalid data type, check if input string is correct.")

        if self.hdfs.exists(self.destination_path):
            return self.hdfs.ls(self.destination_path)

    def import_local_data(self, overwrite=False):
        """
        Import files from local storage folder "import_data".

        Will print out the files that are being pushed to HDFS.

        :return: None
        """
        for data_type in self.import_types.keys():
            try:
                hdfs_files = self.get_files(data_type)
                local_folder = self.import_types[data_type]
                onlyfiles = [f for f in listdir(local_folder) if isfile(join(local_folder, f)) and ".~" not in f]

                for file in onlyfiles:
                    try:
                        if overwrite:
                            dest_path = self.hdfs_types[data_type]
                            self.hdfs.put(os.path.join(local_folder, file), os.path.join(dest_path, file))
                            print(f"Write to HDFS: {os.path.join(dest_path, file)}")
                        else:
                            if sum([file in f for f in hdfs_files]) == 0 or len(hdfs_files) == 0:
                                dest_path = self.hdfs_types[data_type]
                                self.hdfs.put(os.path.join(local_folder, file), os.path.join(dest_path, file))
                                print(f"Write to HDFS: {os.path.join(dest_path, file)}")
                    except Exception as e:
                        print(str(e))

            except Exception as e:
                print(str(e))

    def delete_file(self, file_name, data_type=None):
        """
        Attempt to delete file in HDFS by file name.

        :param file_name: Case sensitive
        :param data_type: Which data type to delete from 'tweet', 'rss', 'corona'
        :return: True if successful else False
        """
        try:
            del_count = 0
            for mdata_type in self.hdfs_types.keys():
                if data_type:
                    if mdata_type == data_type:
                        hdfs_path = self.hdfs_types[mdata_type]
                        hdfs_path = os.path.join(hdfs_path, file_name)
                        if self.hdfs.exists(hdfs_path):
                            self.hdfs.rm(hdfs_path)
                            print(f"File deleted: {file_name}")
                            del_count += 1
                else:
                    hdfs_path = self.hdfs_types[mdata_type]
                    hdfs_path = os.path.join(hdfs_path, file_name)
                    if self.hdfs.exists(hdfs_path):
                        self.hdfs.rm(hdfs_path)
                        print(f"File deleted: {file_name}")
                        del_count += 1

            if del_count == 0:
                print(f"Could not find file in HDFS: {file_name}")
                return False
            else:
                return True
        except Exception as e:
            print(str(e))
            return False

    def is_file_exist(self, file_name):
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    return True
            return False
        except Exception as e:
            print(str(e))

    def read_file(self, file_name):
        """
        Return the DataFrame load from HDFS

        :param file_name: Case sensitive
        :return: DataFrame
        """
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    with self.hdfs.open(hdfs_path) as f:
                        df = pd.read_csv(f)
                        return df
        except Exception as e:
            print(str(e))

    def read_files(self, file_names):
        raise Exception("Not implemented")

    def write_file(self, source_file, file_name):
        try:
            for data_type in self.import_types.keys():
                if data_type in source_file:
                    dest_path = self.hdfs_types[data_type]
                    # pushing to HDFS
                    self.hdfs.put(source_file, os.path.join(dest_path, file_name))
                    print(f"Write to file: {os.path.join(dest_path, file_name)}")
        except Exception as e:
            print(str(e))
Example 15
class HDFSUtil(object):
    """
    Reference: https://hdfs3.readthedocs.io/en/latest/api.html

    HDFS utility handles read, write, delete files
    """
    def __init__(self):
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.destination_path = ""
        self.import_path = '../import_data'
        self.import_path_tweet = os.path.join(self.import_path, 'tweets')
        self.import_path_rss = os.path.join(self.import_path, 'rss')
        self.import_path_corona = os.path.join(self.import_path, 'corona')

        self.hdfs_types = {
            'tweet': self.dest_path_tweet,
            'rss': self.dest_path_rss,
            'corona': self.dest_path_corona
        }
        self.import_types = {
            'tweet': self.import_path_tweet,
            'rss': self.import_path_rss,
            'corona': self.import_path_corona
        }
        self.temp_types = {
            'tweet': "TempData/temp_tweet.csv",
            "rss": "TempData/temp_rss.csv",
            "corona": "TempData/temp_corona.csv"
        }

    def get_files(self, data_type):
        """
        Return a list of the files stored in HDFS for the given data type.

        :param data_type: one of 'tweet', 'rss', 'corona' (string)

        :return: list of file paths stored in HDFS
        """
        if "tweet" == str(data_type).lower():
            self.destination_path = self.dest_path_tweet
        elif "rss" == str(data_type).lower():
            self.destination_path = self.dest_path_rss
        elif "corona" == str(data_type).lower():
            self.destination_path = self.dest_path_corona
        else:
            raise Exception(
                "Invalid data type, check if input string is correct.")

        if self.hdfs.exists(self.destination_path):
            return self.hdfs.ls(self.destination_path)

    def import_local_data(self, overwrite=False):
        """
        Import files from local storage folder "import_data".

        Will print out the files that are being pushed to HDFS.

        :return: None
        """
        for data_type in self.import_types.keys():
            try:
                hdfs_files = self.get_files(data_type)
                local_folder = self.import_types[data_type]
                onlyfiles = [
                    f for f in listdir(local_folder)
                    if isfile(os.path.join(local_folder, f)) and ".~" not in f
                ]

                for file in onlyfiles:
                    try:
                        if overwrite:
                            dest_path = self.hdfs_types[data_type]
                            self.hdfs.put(os.path.join(local_folder, file),
                                          os.path.join(dest_path, file))
                            print(
                                f"Write to HDFS: {os.path.join(dest_path, file)}"
                            )
                        else:
                            if sum([file in f for f in hdfs_files
                                    ]) == 0 or len(hdfs_files) == 0:
                                dest_path = self.hdfs_types[data_type]
                                self.hdfs.put(os.path.join(local_folder, file),
                                              os.path.join(dest_path, file))
                                print(
                                    f"Write to HDFS: {os.path.join(dest_path, file)}"
                                )
                    except Exception as e:
                        print(str(e))

            except Exception as e:
                print(str(e))

    def delete_file(self, file_name, data_type=None):
        """
        Attempt to delete file in HDFS by file name.

        :param file_name: Case sensitive
        :param data_type: Which data type to delete from 'tweet', 'rss', 'corona'
        :return: True if successful else False
        """
        try:
            del_count = 0
            for mdata_type in self.hdfs_types.keys():
                if data_type:
                    if mdata_type == data_type:
                        hdfs_path = self.hdfs_types[mdata_type]
                        hdfs_path = os.path.join(hdfs_path, file_name)
                        if self.hdfs.exists(hdfs_path):
                            self.hdfs.rm(hdfs_path)
                            print(f"File deleted: {file_name}")
                            del_count += 1
                else:
                    hdfs_path = self.hdfs_types[mdata_type]
                    hdfs_path = os.path.join(hdfs_path, file_name)
                    if self.hdfs.exists(hdfs_path):
                        self.hdfs.rm(hdfs_path)
                        print(f"File deleted: {file_name}")
                        del_count += 1

            if del_count == 0:
                print(f"Could not find file in HDFS: {file_name}")
                return False
            else:
                return True
        except Exception as e:
            print(str(e))
            return False

    def delete_all_file(self):
        del_count = 0
        for data_type in self.import_types.keys():
            try:
                hdfs_files = self.get_files(data_type)
                for file in hdfs_files:
                    if self.hdfs.exists(file):
                        self.hdfs.rm(file)
                        print(f"File deleted: {file}")
                        del_count += 1
            except Exception as e:
                print(str(e))
        print(f"Total files deleted: {del_count}")

    def is_file_exist(self, file_name):
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    return True
            return False
        except Exception as e:
            print(str(e))

    def read_file_date(self, start_date, end_date=None, data_type="tweet"):
        """
        Reading file from specific dates

        "start_date": Starting date inclusive, have to be in 31-03-2020 format.

        "end_date": Nullable field. Ending date inclusive, have to be in 31-03-2020 format.

        :param start_date: { "start_date": 31-03-2020 }
        :param end_date:  { "start_date": 31-03-2020, "end_date": 30-04-2020 }
        :param data_type: "tweet", "rss"
        :return: pandas data frame
        """

        start_date = datetime.strptime(start_date, "%d-%m-%Y")
        if end_date:
            end_date = datetime.strptime(end_date, "%d-%m-%Y")
        else:
            end_date = start_date

        # Getting all files with start end date
        files = self.get_files(data_type)
        files_datetime = []
        for f in files:
            try:
                f = f.split("/")[-1]
                datetimeStr = f.split("_")[1]
                datetimeStr = datetimeStr.split(".")[0]
                date = datetime.strptime(datetimeStr, "%d-%m-%Y")
                files_datetime.append(date)
            except Exception as e:
                print(str(e))

        # Append files to be loaded
        result_files = []
        for file, date in zip(files, files_datetime):
            if start_date <= date <= end_date:
                result_files.append(file)

        if len(result_files) == 0:
            return None, None

        # Read and combine all dataframe
        max_files = 3
        df_list = []
        for mfile in result_files[0:max_files]:
            if self.hdfs.exists(mfile):
                with self.hdfs.open(mfile) as file:
                    df = pd.read_csv(file)
                    df_list.append(df)
        df = pd.concat(df_list)

        if data_type == "tweet":
            for idx, row in df.iterrows():
                if str(row['extended_tweet']) != "nan":
                    df['text'].iloc[idx] = row['extended_tweet']

        p_schema = self.pandas_to_spark_schema(df)
        return df, p_schema

    def read_file_dataframe(self, file_name):
        """
        Return the DataFrame load from HDFS

        :param file_name: Case sensitive
        :return: DataFrame, DataFrame_Schema
        """
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    with self.hdfs.open(hdfs_path) as f:
                        df = pd.read_csv(f)
                        p_schema = self.pandas_to_spark_schema(df)
                        return df, p_schema
        except Exception as e:
            print(str(e))

    def read_file(self, file_name):
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    return hdfs_path
        except Exception as e:
            print(str(e))

    # Auxiliary functions
    def equivalent_type(self, f):
        if f == 'datetime64[ns]':
            return DateType()
        elif f == 'int64':
            return LongType()
        elif f == 'int32':
            return IntegerType()
        elif f == 'float64':
            return FloatType()
        else:
            return StringType()

    def define_structure(self, string, format_type):
        try:
            typo = self.equivalent_type(format_type)
        except:
            typo = StringType()
        return StructField(string, typo)

    # Given a pandas dataframe, return the corresponding Spark schema (StructType).
    def pandas_to_spark_schema(self, pandas_df):
        columns = list(pandas_df.columns)
        types = list(pandas_df.dtypes)
        struct_list = []
        for column, typo in zip(columns, types):
            struct_list.append(self.define_structure(column, typo))
        p_schema = StructType(struct_list)
        return p_schema

    def write_file(self, source_file, file_name):
        try:
            for data_type in self.import_types.keys():
                if data_type in source_file:
                    dest_path = self.hdfs_types[data_type]
                    # pushing to HDFS
                    self.hdfs.put(source_file,
                                  os.path.join(dest_path, file_name))
                    print(
                        f"Write to file: {os.path.join(dest_path, file_name)}")
        except Exception as e:
            print(str(e))
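A short usage sketch for the helper above. It assumes a file with the name shown actually exists in HDFS; the file name is hypothetical:

util = HDFSUtil()

# List what is stored for the 'tweet' data type, then load one file back into pandas.
print(util.get_files('tweet'))

pdf, schema = util.read_file_dataframe('tweets_31-03-2020.csv')  # hypothetical file name
print(pdf.shape)
print(schema)   # pyspark StructType derived from the pandas dtypes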
Example 16
class HadoopFileSystem(FileSystem):
    """``FileSystem`` implementation that supports HDFS.

  URL arguments to methods expect strings starting with ``hdfs://``.

  Uses client library :class:`hdfs3.core.HDFileSystem`.
  """
    def __init__(self, pipeline_options):
        """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
        super(HadoopFileSystem, self).__init__(pipeline_options)
        self._hdfs_client = HDFileSystem()

    @classmethod
    def scheme(cls):
        return 'hdfs'

    @staticmethod
    def _parse_url(url):
        """Verifies that url begins with hdfs:// prefix, strips it and adds a
    leading /.

    Raises:
      ValueError if url doesn't begin with hdfs://.

    Args:
      url: A URL in the form hdfs://path/...

    Returns:
      For an input of 'hdfs://path/...', will return '/path/...'.
    """
        m = _URL_RE.match(url)
        if m is None:
            raise ValueError('Could not parse url: %s' % url)
        return m.group(1)

    def join(self, base_url, *paths):
        """Join two or more pathname components.

    Args:
      base_url: string path of the first component of the path.
        Must start with hdfs://.
      paths: path components to be added

    Returns:
      Full url after combining all the passed components.
    """
        basepath = self._parse_url(base_url)
        return _HDFS_PREFIX + self._join(basepath, *paths)

    def _join(self, basepath, *paths):
        return posixpath.join(basepath, *paths)

    def split(self, url):
        rel_path = self._parse_url(url)
        head, tail = posixpath.split(rel_path)
        return _HDFS_PREFIX + head, tail

    def mkdirs(self, url):
        path = self._parse_url(url)
        if self._exists(path):
            raise IOError('Path already exists: %s' % path)
        return self._mkdirs(path)

    def _mkdirs(self, path):
        self._hdfs_client.makedirs(path)

    def match(self, url_patterns, limits=None):
        if limits is None:
            limits = [None] * len(url_patterns)

        if len(url_patterns) != len(limits):
            raise BeamIOError(
                'Patterns and limits should be equal in length: %d != %d' %
                (len(url_patterns), len(limits)))

        # TODO(udim): Update client to allow batched results.
        def _match(path_pattern, limit):
            """Find all matching paths to the pattern provided."""
            file_infos = self._hdfs_client.ls(path_pattern,
                                              detail=True)[:limit]
            metadata_list = [
                FileMetadata(file_info['name'], file_info['size'])
                for file_info in file_infos
            ]
            return MatchResult(path_pattern, metadata_list)

        exceptions = {}
        result = []
        for url_pattern, limit in zip(url_patterns, limits):
            try:
                path_pattern = self._parse_url(url_pattern)
                result.append(_match(path_pattern, limit))
            except Exception as e:  # pylint: disable=broad-except
                exceptions[url_pattern] = e

        if exceptions:
            raise BeamIOError('Match operation failed', exceptions)
        return result

    def _open_hdfs(self, path, mode, mime_type, compression_type):
        if mime_type != 'application/octet-stream':
            logging.warning(
                'Mime types are not supported. Got non-default mime_type:'
                ' %s', mime_type)
        if compression_type == CompressionTypes.AUTO:
            compression_type = CompressionTypes.detect_compression_type(path)
        res = self._hdfs_client.open(path, mode)
        if compression_type != CompressionTypes.UNCOMPRESSED:
            res = CompressedFile(res)
        return res

    def create(self,
               url,
               mime_type='application/octet-stream',
               compression_type=CompressionTypes.AUTO):
        """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
        path = self._parse_url(url)
        return self._create(path, mime_type, compression_type)

    def _create(self,
                path,
                mime_type='application/octet-stream',
                compression_type=CompressionTypes.AUTO):
        return self._open_hdfs(path, 'wb', mime_type, compression_type)

    def open(self,
             url,
             mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
        """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
        path = self._parse_url(url)
        return self._open(path, mime_type, compression_type)

    def _open(self,
              path,
              mime_type='application/octet-stream',
              compression_type=CompressionTypes.AUTO):
        return self._open_hdfs(path, 'rb', mime_type, compression_type)

    def copy(self, source_file_names, destination_file_names):
        """
    Will overwrite files and directories in destination_file_names.

     Raises ``BeamIOError`` if any error occurs.

    Args:
      source_file_names: iterable of URLs.
      destination_file_names: iterable of URLs.
    """
        if len(source_file_names) != len(destination_file_names):
            raise BeamIOError(
                'source_file_names and destination_file_names should '
                'be equal in length: %d != %d' %
                (len(source_file_names), len(destination_file_names)))

        def _copy_file(source, destination):
            with self._open(source) as f1:
                with self._create(destination) as f2:
                    while True:
                        buf = f1.read(_COPY_BUFFER_SIZE)
                        if not buf:
                            break
                        f2.write(buf)

        def _copy_path(source, destination):
            """Recursively copy the file tree from the source to the destination."""
            if not self._hdfs_client.isdir(source):
                _copy_file(source, destination)
                return

            for path, dirs, files in self._hdfs_client.walk(source):
                for dir in dirs:
                    new_dir = self._join(destination, dir)
                    if not self._exists(new_dir):
                        self._mkdirs(new_dir)

                rel_path = posixpath.relpath(path, source)
                if rel_path == '.':
                    rel_path = ''
                for file in files:
                    _copy_file(self._join(path, file),
                               self._join(destination, rel_path, file))

        exceptions = {}
        for source, destination in zip(source_file_names,
                                       destination_file_names):
            try:
                rel_source = self._parse_url(source)
                rel_destination = self._parse_url(destination)
                _copy_path(rel_source, rel_destination)
            except Exception as e:  # pylint: disable=broad-except
                exceptions[(source, destination)] = e

        if exceptions:
            raise BeamIOError('Copy operation failed', exceptions)

    def rename(self, source_file_names, destination_file_names):
        exceptions = {}
        for source, destination in zip(source_file_names,
                                       destination_file_names):
            try:
                rel_source = self._parse_url(source)
                rel_destination = self._parse_url(destination)
                if not self._hdfs_client.mv(rel_source, rel_destination):
                    raise BeamIOError('libhdfs error in renaming %s to %s' %
                                      (source, destination))
            except Exception as e:  # pylint: disable=broad-except
                exceptions[(source, destination)] = e

        if exceptions:
            raise BeamIOError('Rename operation failed', exceptions)

    def exists(self, url):
        """Checks existence of url in HDFS.

    Args:
      url: String in the form hdfs://...

    Returns:
      True if url exists as a file or directory in HDFS.
    """
        path = self._parse_url(url)
        return self._exists(path)

    def _exists(self, path):
        """Returns True if path exists as a file or directory in HDFS.

    Args:
      path: String in the form /...
    """
        return self._hdfs_client.exists(path)

    def delete(self, urls):
        exceptions = {}
        for url in urls:
            try:
                path = self._parse_url(url)
                self._hdfs_client.rm(path, recursive=True)
            except Exception as e:  # pylint: disable=broad-except
                exceptions[url] = e

        if exceptions:
            raise BeamIOError("Delete operation failed", exceptions)
Example 17
        return open(path, mode)

    def put(self, src, dst):
        return shutil.copy(src, dst)

if __name__ == "__main__":
    # load the hdfs node info
    f = open('hdfs.yml', 'r')
    data = yaml.safe_load(f)
    f.close()

    hdfs_nn = data['hdfs_nn']
    hdfs = HDFileSystem(host=hdfs_nn, port=data['hdfs_port'])

    tfs = TransparentFileSystem(hdfs)
    print(hdfs.exists('/tmp'))
    # print hdfs.hoge('/tmp')
    print(tfs.exists('/tmp'))
    # print tfs.hoge('/tmp')

    # tfs_local = TransparentFileSystem()
    # print tfs_local.glob('/var/tmp')

    print('test')
    print(tfs.glob('/tmp'))
    # tfs.hoge()
    tfs_local = TransparentFileSystem()
    # print tfs_local.glob('/home/vagrant/work/data/*')
    # tfs_local.hoge()

    # print tfs.hoge()
Example 18
class HdfsUtils(object):
    def __init__(self, host, port, user):
        self.hdfs = HDFileSystem(host=host, port=port, user=user)

    def hdfs_download(self, hdfs_path, local_path):
        '''
        Download file or dir from hdfs
        :param hdfs_path:
        :param local_path:
        :return:
        '''
        hdfs_path = os.path.normpath(hdfs_path)
        local_path = os.path.normpath(local_path)
        local_parent_path = os.path.dirname(local_path)
        if not self.hdfs.exists(hdfs_path):
            raise Exception('hdfs file not exists: ' + hdfs_path)
        if local_parent_path.strip() and not os.path.exists(local_parent_path):
            raise Exception('local parent folder not exists: ' +
                            local_parent_path)
        if os.path.exists(local_path):
            raise Exception('local file exists: ' + local_path)

        if self.hdfs.isfile(hdfs_path):
            print('is file')
            self.hdfs.get(hdfs_path, local_path)
        elif self.hdfs.isdir(hdfs_path):
            print('is dir')
            os.mkdir(local_path)
            for (root, dirnames, filenames) in self.hdfs.walk(hdfs_path):
                relative_path = os.path.relpath(root, hdfs_path)
                for dirname in dirnames:
                    current_local_dir_path = os.path.join(
                        local_path, relative_path, dirname)
                    os.makedirs(current_local_dir_path)
                for filename in filenames:
                    current_hdfs_file_path = os.path.join(root, filename)
                    current_local_file_path = os.path.join(
                        local_path, relative_path, filename)
                    self.hdfs.get(current_hdfs_file_path,
                                  current_local_file_path)
        else:
            raise Exception('parameters invalid')
        print('Done.')

    def hdfs_upload(self, local_path, hdfs_path):
        '''
        Upload file or dir to hdfs
        :param local_path:
        :param hdfs_path:
        :return:
        '''
        local_path = os.path.normpath(local_path)
        hdfs_path = os.path.normpath(hdfs_path)
        hdfs_parent_path = os.path.dirname(hdfs_path)
        if not os.path.exists(local_path):
            raise Exception('local file not exists: ' + local_path)
        if hdfs_parent_path.strip() and not self.hdfs.exists(hdfs_parent_path):
            raise Exception('hdfs parent folder not exists: ' +
                            hdfs_parent_path)
        if self.hdfs.exists(hdfs_path):
            raise Exception('hdfs file exists: ' + hdfs_path)

        if os.path.isfile(local_path):
            print('is file')
            self.hdfs.put(local_path, hdfs_path)
        elif os.path.isdir(local_path):
            print('is dir')
            self.hdfs.mkdir(hdfs_path)
            for (root, dirnames, filenames) in os.walk(local_path):
                relative_path = os.path.relpath(root, local_path)
                for dirname in dirnames:
                    current_hdfs_dir_path = os.path.join(
                        hdfs_path, relative_path, dirname)
                    self.hdfs.mkdir(current_hdfs_dir_path)
                for filename in filenames:
                    if filename != '.gitignore':
                        current_local_file_path = os.path.join(root, filename)
                        current_hdfs_file_path = os.path.join(
                            hdfs_path, relative_path, filename)
                        self.hdfs.put(current_local_file_path,
                                      current_hdfs_file_path)
        else:
            raise Exception('parameters invalid')
        print('Done.')

    def hdfs_delete(self, hdfs_path):
        '''
        Delete file or dir at hdfs
        :param hdfs_path:
        :return:
        '''
        hdfs_path = os.path.normpath(hdfs_path)
        if self.hdfs.exists(hdfs_path):
            self.hdfs.rm(hdfs_path)
        print('Done.')

    def hdfs_mv(self, source_hdfs_path, target_hdfs_path):
        self.hdfs.mv(source_hdfs_path, target_hdfs_path)
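A short usage sketch for HdfsUtils, with hypothetical connection details and paths (all assumptions, not part of the original snippet):

if __name__ == '__main__':
    # Hypothetical cluster details; adjust to your environment.
    utils = HdfsUtils(host='localhost', port=8020, user='hdfs')

    utils.hdfs_upload('./local_data', '/tmp/uploaded_data')          # upload a file or directory
    utils.hdfs_download('/tmp/uploaded_data', './downloaded_data')   # copy it back locally
    utils.hdfs_mv('/tmp/uploaded_data', '/tmp/archived_data')        # rename inside HDFS
    utils.hdfs_delete('/tmp/archived_data')                          # remove it from HDFS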