Esempio n. 1
0
    def __init__(self, hdfsCluster):
        """Initialise the HDFS wrapper.

        Loads the standard Hadoop site configs, points ``fs.defaultFS`` at
        the given cluster URI and builds FileSystem / FileUtil handles.

        :param hdfsCluster: HDFS name-node URI to use as ``fs.defaultFS``
        """
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)

        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        # (removed a no-op `hdfsCluster = hdfsCluster` self-assignment)
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
Esempio n. 2
0
  def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, [email protected]
    :param keytab_file: optional, absolute path to keytab file
    :raises ValueError: if kerberos is requested without principal/keytab
    """

    self.logger = LoggerFactory.getLogger(self.__class__.__name__)

    # Use %s formatting: the old `"..." + keytab_file` raised a TypeError
    # whenever keytab_file was left at its None default.
    self.logger.info("keytab_file: %s" % keytab_file)

    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
      hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri > "":
      self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

    if kerberos:  # init kerberos and keytab
      # Fail fast: the old code only printed a warning and then attempted
      # loginUserFromKeytab(None, None), which fails later with a far less
      # helpful error.  (`not x` already covers the empty-string case.)
      if not kerberos_principal or not keytab_file:
        raise ValueError("Kerberos Principal and Keytab File Name/Path are required!")

      hdfs_conf.set("hadoop.security.authentication", "kerberos")
      hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
      UserGroupInformation.setConfiguration(hdfs_conf)
      UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_file)

    self.fs = Hdfs.get(hdfs_conf)

    # Suppress urllib3's unverified-HTTPS warnings for subsequent requests.
    requests.packages.urllib3.disable_warnings()
Esempio n. 3
0
class Hdfs:
    """Thin wrapper around the Hadoop FileSystem API for one HDFS cluster."""

    def __init__(self, hdfsCluster):
        """Load Hadoop site configs and build FileSystem/FileUtil handles.

        :param hdfsCluster: HDFS name-node URI to use as ``fs.defaultFS``
        """
        self.logger = Logger.getLogger("Hdfs")
        # self.logger.setLevel(Level.DEBUG)

        coreSite = "/etc/hadoop/conf/core-site.xml"
        hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
        # (removed a no-op `hdfsCluster = hdfsCluster` self-assignment)
        self.cHdfs = Configuration()
        self.cHdfs.addResource(Path(coreSite))
        self.cHdfs.addResource(Path(hdfsSite))
        self.cHdfs.set("fs.defaultFS", hdfsCluster)
        self.fileSystem = FileSystem.get(self.cHdfs)
        self.fileUtil = FileUtil()
Esempio n. 4
0
 def __init__(self):
     '''
     Init hadoop filesystem objects.

     Builds a default Hadoop Configuration (site files are picked up from
     the classpath) and obtains the corresponding FileSystem handle.
     Relies on module-level `logger` and `fs` (hadoop.fs) bindings.
     '''
     logger.debug("Initiate hadoop.fs.FileSystem object")
     self.config = Configuration()
     self.file_system = fs.FileSystem.get(self.config)
Esempio n. 5
0
  def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, [email protected]
    :param keytab_file: optional, absolute path to keytab file
    :raises ValueError: if kerberos is requested without principal/keytab
    """
    self.logger = LoggerFactory.getLogger(self.__class__.__name__)

    # Use %s formatting: the old `"..." + keytab_file` raised a TypeError
    # whenever keytab_file was left at its None default.
    self.logger.info("keytab_file: %s" % keytab_file)

    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
      hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri > "":
      self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

    if kerberos:  # init kerberos and keytab
      # Fail fast: the old code only printed a warning and then attempted
      # loginUserFromKeytab(None, None), which fails later with a far less
      # helpful error.  (`not x` already covers the empty-string case.)
      if not kerberos_principal or not keytab_file:
        raise ValueError("Kerberos Principal and Keytab File Name/Path are required!")

      hdfs_conf.set("hadoop.security.authentication", "kerberos")
      hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
      UserGroupInformation.setConfiguration(hdfs_conf)
      UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_file)

    self.fs = Hdfs.get(hdfs_conf)

    # Suppress urllib3's unverified-HTTPS warnings for subsequent requests.
    requests.packages.urllib3.disable_warnings()
    self.logger.info("Initiated SchemaUrlHelper")
Esempio n. 6
0
 def hdfs_folder_exists(self, folder):
     # Return True if `folder` exists on HDFS and is a directory,
     # False otherwise (including on any lookup failure).
     path = Path(folder)
     # Resolve the FileSystem from the path itself with a default config.
     fs = path.getFileSystem(Configuration())
     try:
         status = fs.getFileStatus(path)
         # TODO: there could be problems if it exists but is a simple file
         return status.isDir()
     except:
         # NOTE(review): the bare except also swallows unrelated errors
         # (e.g. connectivity failures) and reports them as "missing";
         # under Jython it must stay broad to catch Java IOExceptions too.
         return False
    def __init__(self,
                 hdfs_uri,
                 kerberos=False,
                 kerberos_principal=None,
                 keytab_file=None):
        """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, [email protected]
    :param keytab_file: optional, user.keytab or ~/.kerberos/user.keytab
    """

        self.logger = LoggerFactory.getLogger(self.__class__.__name__)

        hdfs_conf = Configuration()
        if hdfs_uri.startswith('hdfs://'):
            hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
        elif hdfs_uri > "":
            self.logger.error(
                "%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

        if kerberos == True:  #  init kerberos and keytab
            if not kerberos_principal or not keytab_file or kerberos_principal == '' or keytab_file == '':
                print "Kerberos Principal and Keytab File Name/Path are required!"

            keytab_path = keytab_file
            if keytab_file.startswith('/'):
                if os.path.exists(keytab_file):
                    keytab_path = keytab_file
                    print "Using keytab at %s" % keytab_path
            else:  # try relative path
                all_locations = [
                    os.getcwd(),
                    expanduser("~") + "/.ssh",
                    expanduser("~") + "/.kerberos",
                    expanduser("~") + "/.wherehows",
                    os.getenv("APP_HOME"),
                    os.getenv("WH_HOME")
                ]
                for loc in all_locations:
                    if os.path.exists(loc + '/' + keytab_file):
                        keytab_path = loc + '/' + keytab_file
                        print "Using keytab at %s" % keytab_path
                        break

            hdfs_conf.set("hadoop.security.authentication", "kerberos")
            hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
            UserGroupInformation.setConfiguration(hdfs_conf)
            UserGroupInformation.loginUserFromKeytab(kerberos_principal,
                                                     keytab_path)

        self.fs = Hdfs.get(hdfs_conf)

        requests.packages.urllib3.disable_warnings()
Esempio n. 8
0
def expand_path_with_home(output_folder):
    """Prepend the HDFS home folder to a relative output location.

    A path with no hdfs:/file: scheme that is not absolute gets the
    current user's HDFS home folder prepended. In local mode the path
    is returned untouched.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode != 'hadoop':
        return output_folder
    needs_home = (output_folder == '' or
                  not output_folder.startswith(('hdfs:', 'file:', '/')))
    if needs_home:
        home = Path('/').getFileSystem(Configuration()).getHomeDirectory()
        return home.toString() + '/' + output_folder
    return output_folder
Esempio n. 9
0
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    Only if we specified a relative path and no scheme, prepend it with the
    home folder of the user on HDFS. This behavior is similar to how
    "hadoop fs" works. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.config['pycascading.running_mode'] != 'hadoop':
        return output_folder
    # Already absolute or scheme-qualified: nothing to do.
    if output_folder.startswith(('hdfs:', 'file:', 's3:', 's3n:', '/')):
        return output_folder
    fs = Path('/').getFileSystem(Configuration())
    return fs.getHomeDirectory().toString() + '/' + output_folder
Esempio n. 10
0
  def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, [email protected]
    :param keytab_file: optional, user.keytab or ~/.kerberos/user.keytab
    """

    self.logger = LoggerFactory.getLogger(self.__class__.__name__)

    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
      hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri > "":
      self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)

    if kerberos == True:  #  init kerberos and keytab
      if not kerberos_principal or not keytab_file or kerberos_principal == '' or keytab_file == '':
        print "Kerberos Principal and Keytab File Name/Path are required!"

      keytab_path = keytab_file
      if keytab_file.startswith('/'):
        if os.path.exists(keytab_file):
          keytab_path = keytab_file
          print "Using keytab at %s" % keytab_path
      else:  # try relative path
        all_locations = [os.getcwd(), expanduser("~") + "/.ssh",
            expanduser("~") + "/.kerberos", expanduser("~") + "/.wherehows",
            os.getenv("APP_HOME"), os.getenv("WH_HOME")]
        for loc in all_locations:
          if os.path.exists(loc + '/' + keytab_file):
            keytab_path = loc + '/' + keytab_file
            print "Using keytab at %s" % keytab_path
            break

      hdfs_conf.set("hadoop.security.authentication", "kerberos")
      hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
      UserGroupInformation.setConfiguration(hdfs_conf)
      UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_path)

    self.fs = Hdfs.get(hdfs_conf)

    requests.packages.urllib3.disable_warnings()
Esempio n. 11
0
 def __init__(self):
     '''
     Constructor.

     Creates a default Hadoop Configuration (site files picked up from
     the classpath) and the matching hadoop.fs.FileSystem instance for
     later file operations.
     '''
     self.config = Configuration()
     self.file_system = fs.FileSystem.get(self.config)
Esempio n. 12
0
from java.io import IOException
from java.util import Properties
from java.lang import SecurityException
from java.lang import System

import sys
import time

startTime = time.time()

if len(sys.argv) != 2:
    raise sys.argv[0] + ' <basedir>'

# Get reference to the Hadoop FileSystem object.  Everything we do in
# this script that interacts with HDFS is through this object.
fs = FileSystem.get(Configuration())

# Make sure the requested collection exists.
collection = sys.argv[1]
collectionDir = Path(collection)

if not fs.exists(collectionDir):
    print '\nERROR: no collection directory: %s' % collectionDir
    System.exit(1)

# Check for "guard" file.  Like a semaphore, ensures that we don't try
# to update this collection while it's in the middle of being updated.
# Since file creation in HDFS is atomic, we don't check for the existence
# of the guardFile, rather we try to create it.  If the file already exists
# then fs.createNewFile() will return False
guardFile = Path(collectionDir, '_updating')