Example 1
def getFileSystem(fs="dfs"):
    """
    Returns a Hadoop FileSystem object, either "dfs" (default) or "local".
    """
    if fs == "dfs": return FileSystem.get(happy.getJobConf())
    elif fs == "local": return FileSystem.getLocal(happy.getJobConf())
    else: raise Exception("Unknown filesystem " + fs)
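A short usage sketch for this helper, assuming it runs inside a Happy job script where happy.getJobConf() is available; the input path and the Path import are illustrative additions, not part of the original snippet:

# Hypothetical usage inside a Happy job script
from org.apache.hadoop.fs import Path

fs = getFileSystem()               # HDFS (the default)
localFs = getFileSystem("local")   # local filesystem wrapper
inputDir = Path("/user/data/input")  # placeholder path
if fs.exists(inputDir):
    for status in fs.listStatus(inputDir):
        print status.getPath()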
Example 2
  def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, Kerberos principal (user@REALM)
    :param keytab_file: optional, absolute path to keytab file
    """
    self.logger = LoggerFactory.getLogger(self.__class__.__name__)

    self.logger.info("keytab_file: " + keytab_file)

    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
      hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri:
      self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

    if kerberos:  # initialize Kerberos login from the keytab
      if not kerberos_principal or not keytab_file:
        raise Exception("Kerberos principal and keytab file name/path are required!")

      hdfs_conf.set("hadoop.security.authentication", "kerberos")
      hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
      UserGroupInformation.setConfiguration(hdfs_conf)
      UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_file)

    self.fs = Hdfs.get(hdfs_conf)

    requests.packages.urllib3.disable_warnings()
    self.logger.info("Initiated SchemaUrlHelper")
Example 3
  def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, Kerberos principal (user@REALM)
    :param keytab_file: optional, absolute path to keytab file
    """

    self.logger = LoggerFactory.getLogger(self.__class__.__name__)

    self.logger.info("keytab_file: " + keytab_file)

    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
      hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri:
      self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

    if kerberos:  # initialize Kerberos login from the keytab
      if not kerberos_principal or not keytab_file:
        raise Exception("Kerberos principal and keytab file name/path are required!")

      hdfs_conf.set("hadoop.security.authentication", "kerberos")
      hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
      UserGroupInformation.setConfiguration(hdfs_conf)
      UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_file)

    self.fs = Hdfs.get(hdfs_conf)

    requests.packages.urllib3.disable_warnings()
Example 4
    def cp(self, srcfpath, trgfpath):
        "Copy data within the current HDFS."
        try:
            sp = []
            fileList = self.fsHd.globStatus(Path(srcfpath))
            if (fileList is None) or (len(fileList) == 0):
                # No files matched srcfpath: log an error and stop instead of iterating over None.
                self.logger.error("No files found for {}".format(srcfpath))
                return
            for sfp in fileList:
                sp.append(sfp.getPath())

            sfs = FileSystem.newInstance(self.hdfs.cHdfs)
            tp = Path(trgfpath)
            tfs = FileSystem.newInstance(self.hdfs.cHdfs)
            delSrc = False
            overWrite = True
            self.hdfs.fileUtil.copy(sfs, sp, tfs, tp, delSrc, overWrite, self.hdfs.cHdfs)
        except JException as ex:
            self.logger.error("Exception in HdfsUtil.cp({} -> {}): ex[{}]".format(srcfpath, trgfpath, ex))
    def __init__(self,
                 hdfs_uri,
                 kerberos=False,
                 kerberos_principal=None,
                 keytab_file=None):
        """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, Kerberos principal (user@REALM)
    :param keytab_file: optional, user.keytab or ~/.kerberos/user.keytab
    """

        self.logger = LoggerFactory.getLogger(self.__class__.__name__)

        hdfs_conf = Configuration()
        if hdfs_uri.startswith('hdfs://'):
            hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
        elif hdfs_uri:
            self.logger.error(
                "%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

        if kerberos:  # initialize Kerberos login from the keytab
            if not kerberos_principal or not keytab_file:
                raise Exception("Kerberos principal and keytab file name/path are required!")

            keytab_path = keytab_file
            if keytab_file.startswith('/'):
                if os.path.exists(keytab_file):
                    keytab_path = keytab_file
                    print "Using keytab at %s" % keytab_path
            else:  # try relative path
                all_locations = [
                    os.getcwd(),
                    expanduser("~") + "/.ssh",
                    expanduser("~") + "/.kerberos",
                    expanduser("~") + "/.wherehows",
                    os.getenv("APP_HOME"),
                    os.getenv("WH_HOME")
                ]
                for loc in all_locations:
                    # os.getenv() may return None, so skip unset locations
                    if loc and os.path.exists(loc + '/' + keytab_file):
                        keytab_path = loc + '/' + keytab_file
                        print "Using keytab at %s" % keytab_path
                        break

            hdfs_conf.set("hadoop.security.authentication", "kerberos")
            hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
            UserGroupInformation.setConfiguration(hdfs_conf)
            UserGroupInformation.loginUserFromKeytab(kerberos_principal,
                                                     keytab_path)

        self.fs = Hdfs.get(hdfs_conf)

        requests.packages.urllib3.disable_warnings()
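The keytab lookup above searches a fixed list of directories when a relative keytab name is given. Below is a standalone sketch of the same resolution logic, handy for testing the search order in isolation; the function name and the None-on-failure behavior are assumptions, not part of the original class:

import os
from os.path import expanduser

def resolve_keytab(keytab_file):
    # Mirror the search order used in __init__ above.
    if keytab_file.startswith('/'):
        return keytab_file if os.path.exists(keytab_file) else None
    candidates = [os.getcwd(),
                  expanduser("~") + "/.ssh",
                  expanduser("~") + "/.kerberos",
                  expanduser("~") + "/.wherehows",
                  os.getenv("APP_HOME"),
                  os.getenv("WH_HOME")]
    for loc in candidates:
        if loc and os.path.exists(loc + '/' + keytab_file):
            return loc + '/' + keytab_file
    return None  # not found; the caller decides how to fail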
Example 6
    def __init__(self, hdfsCluster):
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)

        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
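A possible way to use this wrapper, assuming the class is named Hdfs as the logger name suggests; the cluster URI and path are placeholders, and Path is assumed to be imported as in the constructor above:

# Placeholder cluster URI; the constructor layers it over the site XML files.
hdfs = Hdfs("hdfs://namenode.example.com:8020")
workDir = Path("/tmp/example")
if not hdfs.fileSystem.exists(workDir):
    hdfs.fileSystem.mkdirs(workDir)
print hdfs.fileSystem.getFileStatus(workDir).getPath()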
Example 7
    def cp(self, srcfpath, trgfpath):
        "Copy data within the current HDFS."
        try:
            sp = []
            fileList = self.fsHd.globStatus(Path(srcfpath))
            if (fileList is None) or (len(fileList) == 0):
                # No files matched srcfpath: log an error and stop instead of iterating over None.
                self.logger.error("No files found for {}".format(srcfpath))
                return
            for sfp in fileList:
                sp.append(sfp.getPath())

            sfs = FileSystem.newInstance(self.hdfs.cHdfs)
            tp = Path(trgfpath)
            tfs = FileSystem.newInstance(self.hdfs.cHdfs)
            delSrc = False
            overWrite = True
            self.hdfs.fileUtil.copy(sfs, sp, tfs, tp, delSrc, overWrite,
                                    self.hdfs.cHdfs)
        except JException as ex:
            self.logger.error(
                "Exception in HdfsUtil.cp({} -> {}): ex[{}]".format(
                    srcfpath, trgfpath, ex))
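A hedged usage of cp, assuming it belongs to an HdfsUtil-style helper (the exception message suggests that name) already wired to an Hdfs wrapper; the helper variable and the glob and target paths are placeholders:

# Copy every matching part file into the archive directory on the same cluster.
util.cp('/data/incoming/2016-01-01/part-*', '/data/archive/2016-01-01')

Because delSrc is False and overWrite is True, the source files are left in place and any files already present at the target are replaced.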
Example 8
    def __init__(self, hdfsCluster):
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)

        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
Example 9
  def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, Kerberos principal (user@REALM)
    :param keytab_file: optional, user.keytab or ~/.kerberos/user.keytab
    """

    self.logger = LoggerFactory.getLogger(self.__class__.__name__)

    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
      hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri:
      self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

    if kerberos:  # initialize Kerberos login from the keytab
      if not kerberos_principal or not keytab_file:
        raise Exception("Kerberos principal and keytab file name/path are required!")

      keytab_path = keytab_file
      if keytab_file.startswith('/'):
        if os.path.exists(keytab_file):
          keytab_path = keytab_file
          print "Using keytab at %s" % keytab_path
      else:  # try relative path
        all_locations = [os.getcwd(), expanduser("~") + "/.ssh",
            expanduser("~") + "/.kerberos", expanduser("~") + "/.wherehows",
            os.getenv("APP_HOME"), os.getenv("WH_HOME")]
        for loc in all_locations:
          # os.getenv() may return None, so skip unset locations
          if loc and os.path.exists(loc + '/' + keytab_file):
            keytab_path = loc + '/' + keytab_file
            print "Using keytab at %s" % keytab_path
            break

      hdfs_conf.set("hadoop.security.authentication", "kerberos")
      hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
      UserGroupInformation.setConfiguration(hdfs_conf)
      UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_path)

    self.fs = Hdfs.get(hdfs_conf)

    requests.packages.urllib3.disable_warnings()
Example 10
from java.io import IOException
from java.util import Properties
from java.lang import SecurityException
from java.lang import System

# Hadoop classes used below
from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import FileSystem, Path

import sys
import time

startTime = time.time()

if len(sys.argv) != 2:
    # A bare string cannot be raised as an exception; exit with a usage message instead.
    sys.exit('Usage: %s <basedir>' % sys.argv[0])

# Get reference to the Hadoop FileSystem object.  Everything we do in
# this script that interacts with HDFS is through this object.
fs = FileSystem.get(Configuration())

# Make sure the requested collection exists.
collection = sys.argv[1]
collectionDir = Path( collection )

if not fs.exists( collectionDir ):
    print '\nERROR: no collection directory: %s' % collectionDir
    System.exit(1)

# Check for the "guard" file.  Like a semaphore, it ensures that we don't try
# to update this collection while it's in the middle of being updated.
# Since file creation in HDFS is atomic, we don't check for the existence
# of the guardFile, rather we try to create it.  If the file already exists
# then fs.createNewFile() will return False
guardFile = Path( collectionDir, '_updating' )
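The snippet stops right after building the guard-file path. A minimal sketch of how the atomic createNewFile() check described in the comment might continue; this continuation is an assumption, not part of the original script:

# Acquire the guard atomically; createNewFile() returns False if it already exists.
if not fs.createNewFile(guardFile):
    print '\nERROR: collection is already being updated: %s' % collectionDir
    System.exit(1)
try:
    pass  # ... perform the collection update here ...
finally:
    # Release the guard so later runs can update the collection again.
    fs.delete(guardFile, False)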
Example 11
 def sameSize(self, local, dest):
     fs1 = FileSystem.getLocal(self.configuration)
     fs2 = dest.getFileSystem(self.configuration)
     return fs1.getFileStatus(local).getLen() == fs2.getFileStatus(dest).getLen()
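A possible call site for sameSize, assuming it is defined on a helper object whose self.configuration is a Hadoop Configuration; the helper variable, file paths, and Path import are illustrative:

from org.apache.hadoop.fs import Path

# Skip re-uploading a file that already has the same length at the destination.
local = Path('file:///data/export/part-00000')
dest = Path('hdfs://namenode.example.com:8020/warehouse/part-00000')
if not helper.sameSize(local, dest):
    print 'sizes differ, copying %s -> %s' % (local, dest)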
Example 12
from java.io import IOException
from java.util import Properties
from java.lang import SecurityException
from java.lang import System

# Hadoop classes used below
from org.apache.hadoop.conf import Configuration
from org.apache.hadoop.fs import FileSystem, Path

import sys
import time

startTime = time.time()

if len(sys.argv) != 2:
    # A bare string cannot be raised as an exception; exit with a usage message instead.
    sys.exit('Usage: %s <basedir>' % sys.argv[0])

# Get reference to the Hadoop FileSystem object.  Everything we do in
# this script that interacts with HDFS is through this object.
fs = FileSystem.get(Configuration())

# Make sure the requested collection exists.
collection = sys.argv[1]
collectionDir = Path(collection)

if not fs.exists(collectionDir):
    print '\nERROR: no collection directory: %s' % collectionDir
    System.exit(1)

# Check for the "guard" file.  Like a semaphore, it ensures that we don't try
# to update this collection while it's in the middle of being updated.
# Since file creation in HDFS is atomic, we don't check for the existence
# of the guardFile, rather we try to create it.  If the file already exists
# then fs.createNewFile() will return False
guardFile = Path(collectionDir, '_updating')
Example 13
 def sameSize(self, local, dest):
     fs1 = FileSystem.getLocal(self.configuration)
     fs2 = dest.getFileSystem(self.configuration)
     return fs1.getFileStatus(local).getLen() == fs2.getFileStatus(
         dest).getLen()