Example No. 1
def process():
    # Try each configured namenode URL and return the first client that responds.
    for url in urlMaping.values():
        client = KerberosClient(url, root=root, proxy=proxy)
        try:
            client.list("/")
            return client
        except Exception:
            continue
    return None  # no reachable namenode
Example No. 2
def testip(ip, root=None, proxy=None):
    print(ip)

    if ip == '':
        return process()
    else:
        client = KerberosClient(urlMaping[ip], root=root, proxy=proxy)
        try:
            print('test %s' % urlMaping[ip])
            client.list("/")
            return client
        except Exception:
            # Fall back to scanning every configured namenode.
            return process()
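The two helpers above rely on a module-level urlMaping dict plus root and proxy settings that the excerpt does not show. A minimal usage sketch under that assumption; every name and URL below is illustrative, not taken from the original source:

from hdfs.ext.kerberos import KerberosClient

# Hypothetical configuration mirroring the globals the snippets rely on.
urlMaping = {
    'nn1': 'http://namenode1.example.com:50070',
    'nn2': 'http://namenode2.example.com:50070',
}
root = None
proxy = None

client = testip('nn1')        # probe the preferred namenode, fall back to process()
if client is not None:
    print(client.list('/'))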
Example No. 3
    def get_model(self):
        client = KerberosClient(settings.DUNANT_HDFS_PATH)

        MODEL_DIR = settings.DUNANT_MODEL_DIR
        MOST_RECENT_MODEL = sorted(client.list(MODEL_DIR))[-1]

        MODEL_PARAMETERS_PATH = f'{MODEL_DIR}/{MOST_RECENT_MODEL}/model'
        MLB_PATH = f'{MODEL_PARAMETERS_PATH}/mlb_binarizer.pkl'
        VECTORIZER_PATH = f'{MODEL_PARAMETERS_PATH}/vectorizer.pkl'
        CLASSIFIER_PATH = f'{MODEL_PARAMETERS_PATH}/model.pkl'

        # For pickle to be able to unpickle, the class must be present in the
        # same import structure as when it was pickled.
        # Manually setting sys.modules to mimic the expected import structure
        sys.modules['models'] = classifiers

        # Latin1 encoding required to convert Python2 pickle to Python3
        with client.read(MLB_PATH) as r:
            mlb = pickle.loads(r.read(), encoding="latin1")
        with client.read(VECTORIZER_PATH) as r:
            vectorizer = pickle.loads(r.read(), encoding="latin1")
        with client.read(CLASSIFIER_PATH) as r:
            clf = pickle.loads(r.read(), encoding="latin1")
        del sys.modules['models']

        return mlb, vectorizer, clf
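The loaded binarizer, vectorizer, and classifier are typically chained as in the sketch below. This is a non-authoritative illustration: model_loader stands in for whatever object exposes get_model, and the scikit-learn-style transform/predict calls are assumed rather than shown in the original.

# Illustrative prediction flow with the objects returned by get_model().
mlb, vectorizer, clf = model_loader.get_model()
X = vectorizer.transform(["some document text"])   # texts -> feature matrix
y = clf.predict(X)                                  # binary label indicator matrix
labels = mlb.inverse_transform(y)                   # indices -> label names
print(labels)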
Example No. 4
def hdfs_connect_demo():

    # NOTE: krbContext calls kinit under the hood
    with krbContext(using_keytab=True,
                    principal='*****@*****.**',
                    keytab_file='/houleilei.client.keytab'):
        client = KerberosClient('http://hadoop01.stor:50070',
                                hostname_override='hadoop01.stor')
        # client = InsecureClient('http://hadoop01.stor:50070', user='******')
        result = client.list('/home/holyzing/')
        print(type(result), result)
Example No. 5
    def get_client(self, block_params=None, connection_params=None):
        try:
            kerb_auth = False
            method = "https"

            if "https" in connection_params:
                if connection_params["https"]:
                    method = "https"
                else:
                    method = "http"

            host_name = connection_params["hostName"]
            port = connection_params["port"]

            if 'kerberos' in connection_params:
                kerb_auth = bool(connection_params['kerberos'])

            if kerb_auth:
                principal = generate_ticket_granting_ticket(
                    block_params, connection_params["authName"])
                session = requests.Session()
                session.verify = False
                full_host = "%s://%s:%s" % (method, host_name, port)
                client = KerberosClient(url=full_host,
                                        session=session,
                                        mutual_auth='OPTIONAL',
                                        principal=principal)
                client.list('/')
                return client
            else:
                hadoop_host = host_name + ":" + port
                client = InsecureClient("http://" + hadoop_host)
                client.list('/')
                return client
        except Exception:
            self.logger.error(
                "Error occurred while connecting to HDFS with the given connection details"
            )
            raise
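A sketch of the connection_params dictionary that get_client expects, with keys inferred from the code above and placeholder values; block_params is passed through to generate_ticket_granting_ticket and its shape is not shown here:

connection_params = {
    "https": False,          # falls back to plain HTTP when False
    "hostName": "namenode.example.com",
    "port": "50070",
    "kerberos": True,        # switches between KerberosClient and InsecureClient
    "authName": "hdfs-service-account",
}
# 'connector' is a hypothetical instance of the class defining get_client():
# client = connector.get_client(block_params={}, connection_params=connection_params)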
Example No. 6
    def __init__(self,
                 hdfs_urls,
                 path_hdfs='./',
                 max_file_size=MAX_FILE_SIZE,
                 max_process=4,
                 log_level='INFO'):
        """
        :param hdfs_urls list[str]: HDFS namenode URLs (ex: ['X'])
        :param path_hdfs str: path to write files to in HDFS
        :param max_file_size int: size limit before creating a new file and saving the current one to HDFS (compressed)
        :param max_process int: number of subprocesses used to compress and write files to HDFS (max_process > 0)
        :param log_level str: logger level
        """
        # Configure logger
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        self.logger = logging.getLogger('WriteHdfs')
        self.logger.addHandler(stream_handler)
        self.logger.setLevel(log_level)
        # Configure signal exit
        signal.signal(signal.SIGINT, self.__signal_handler)
        # Try to find the active namenode in the list
        for hdfs_url in hdfs_urls:
            try:
                hdfs_client = KerberosClient(hdfs_url)
                hdfs_client.list(path_hdfs)
                self.hdfs_url = hdfs_url
                self.logger.info('identify namenode: %s' % hdfs_url)
                break
            except hdfs.util.HdfsError:
                continue
        else:
            # No namenode responded; fail fast rather than leave hdfs_url unset.
            raise hdfs.util.HdfsError('no active namenode found in %s' % hdfs_urls)
        self.path_hdfs = path_hdfs
        self.max_process = max_process
        # Files settings
        self.file_size = 0
        self.file_name = self.__generate_file_name()
        self.max_file_size = max_file_size
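Constructing the writer might look like the sketch below; the class name WriteHdfs is only inferred from the logger name above, and every value shown is a placeholder:

# Hypothetical instantiation of the class whose __init__ is shown above.
writer = WriteHdfs(
    hdfs_urls=['http://namenode1.example.com:50070',
               'http://namenode2.example.com:50070'],
    path_hdfs='/data/incoming/',
    max_file_size=64 * 1024 * 1024,   # rotate files at roughly 64 MiB
    max_process=4,
    log_level='INFO',
)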
Example No. 7
HDFS_USER = options['hdfs_user']
HDFS_MODEL_DIR = options['hdfs_model_dir']
EVALUATE_SAVE_GSPREAD = options['evaluate_save_gspread']
FORMATTED_HDFS_PATH = "/".join(HDFS_MODEL_DIR.split('/')[5:])

print('Running Evaluate script:')
print('Connecting to HDFS and Oracle database...')
client = KerberosClient(HDFS_URL)

conn = jdbc.connect("oracle.jdbc.driver.OracleDriver", URL_ORACLE_SERVER,
                    [USER_ORACLE, PASSWD_ORACLE], ORACLE_DRIVER_PATH)
curs = conn.cursor()

MOTIVOS_DICT = get_motivos_declarados(curs)

model_dates = sorted(client.list(FORMATTED_HDFS_PATH))
validated_datasets = []
classified_datasets = []

for model_date in model_dates:
    try:
        data_hdfs = get_results_from_hdfs(client,
                                          FORMATTED_HDFS_PATH,
                                          model_date=model_date)
    except Exception:
        continue
    # Results are stored as a tuple represented as a string
    data_hdfs['MDEC_DK'] = data_hdfs['MDEC_DK'].apply(ast.literal_eval)

    keys = get_keys(data_hdfs, 'SNCA_DK')
Example No. 8
with krbContext(using_keytab=True,
                principal=config['kerberos_principal'],
                keytab_file=config['keytab_file'],
                ccache_file=config['kerberos_cache_file']):
    # hive.Connection()
    con = hive.connect(host='uatnd02.csdntest.com.local',
                       port=10000,
                       auth='KERBEROS',
                       kerberos_service_name="hive"
                       )  # host is the HiveServer2 node; port defaults to 10000 (the HS2 port)
    cursor = con.cursor()
    cursor.execute('select * from dl_nccp.account limit 5')  # no trailing semicolon!
    # cursor.execute('desc dl_nccp.account')  # no trailing semicolon!
    datas = cursor.fetchall()
    print(datas)
    cursor.close()
    con.close()

    conn = dbapi.connect(host='uatnd02.csdntest.com.local',
                         port=10000,
                         auth_mechanism='GSSAPI',
                         kerberos_service_name="hive")
    cursor = conn.cursor()

    # HDFS with Kerberos; hostname_override sets the hostname used for the
    # Kerberos service principal instead of the host in the URL.
    client = KerberosClient('http://hdfs_ip:50070', hostname_override="hdfs-hostname")
    client.list('/')
    client.makedirs('test')
    # The remaining operations take explicit paths, e.g.:
    # client._list_status(hdfs_path)   # low-level WebHDFS LISTSTATUS call
    # client.delete(hdfs_path, recursive=True)
    # client.upload(hdfs_path, local_path)
    # client.download(hdfs_path, local_path)
Example No. 9
from hdfs.ext.kerberos import KerberosClient

if __name__ == "__main__":
    client = KerberosClient("http://10.214.208.11:9000")
    client.list("/")
Example No. 10
def execute_process(args):

    app_name = "criar_tabela_tce"
    spark = pyspark.sql.session.SparkSession \
        .builder \
        .appName(app_name) \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .enableHiveSupport() \
        .getOrCreate()

    client = KerberosClient(args.webHdfs)

    hdfs_files = client.list(args.pathDirectoryBase)

    for directory in hdfs_files:

        try:

            actual_directory = args.pathDirectoryBase + directory

            df = spark.read.text(actual_directory)

            if not df.rdd.isEmpty():

                # df = spark.read.load(actual_directory, format="csv", multiLine=True,
                #                     sep=args.delimiter, inferSchema=True, header=True)

                columns_types = params_table.table_columns_type[directory]

                df = spark.read.option("quote", "\"") \
                    .option("escape", "\"") \
                    .load(actual_directory, format="csv", sep=args.delimiter, header=True)

                columns = [
                    trait_columns_name(column_name)
                    for column_name in df.columns
                ]

                df = df.toDF(*columns)

                df = reduce(check_type, columns_types, df)

                #df = reduce(remove_break_lines, df.dtypes, df)

                table_hive = "{}.{}".format(args.schemaHive, directory)

                table_postgres = "{}.{}".format(args.schemaPostgres, directory)

                df.write.mode("overwrite").format("parquet").saveAsTable(
                    table_hive)

                spark.sql(
                    "ANALYZE TABLE {} COMPUTE STATISTICS".format(table_hive))

                execute_compute_stats(table_hive)

                export_to_postgres(df, args, table_postgres)

                send_log(SUCCESS_MESSAGE.format(directory), app_name, SUCCESS,
                         args.solrServer, args.source)

        except Exception as message:
            send_log(ERROR_MESSAGE.format(directory, message), app_name, ERROR,
                     args.solrServer, args.source)
Example No. 11
df = df.groupby([ID_COLUMN, TEXT_COLUMN])\
       .agg(lambda x: set(x))\
       .reset_index()

nb_new_documents = len(df)
if nb_new_documents == 0:
    print('No new data to predict!')
    sys.exit()
else:
    print('{} new documents to predict.\n'.format(nb_new_documents))

X = np.array(df[TEXT_COLUMN])

print('Loading models...')
formatted_hdfs_path = "/".join(HDFS_MODEL_DIR.split('/')[5:])
most_recent_date = sorted(client.list(formatted_hdfs_path))[-1]
with client.read('{}/{}/model/mlb_binarizer.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as mlb_reader:
    mlb = pickle.loads(mlb_reader.read())
with client.read('{}/{}/model/vectorizer.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as vectorizer_reader:
    vectorizer = pickle.loads(vectorizer_reader.read())
with client.read('{}/{}/model/model.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as clf_reader:
    clf = pickle.loads(clf_reader.read())

print('Predicting...')
reg_clf = RegexClassifier(RULES)
y_regex = reg_clf.predict(X)
y_regex = mlb.transform(y_regex)
Example No. 12
from hdfs.ext.kerberos import KerberosClient


class OperateHDFS:
    def __init__(self, url):
        '''

        :param url: hostname or IP address of the HDFS namenode, prefixed with the protocol and followed by the namenode's WebHDFS port; several URLs separated by semicolons may also be given for High Availability support.
        '''
        # Instantiate an HDFS web client using Kerberos authentication
        self.client = KerberosClient(url)

    def file_list(self, file_path):
        '''

        :param file_path: remote HDFS directory path
        :return: all files contained in the remote directory
        '''
        file_detail = self.client.list(hdfs_path=file_path)
        return file_detail

    def file_read(self, file_path):
        '''
        Read a file from HDFS
        :param file_path: remote HDFS file path
        :return:
        '''
        lines = []
        with self.client.read(hdfs_path=file_path,
                              encoding='utf-8',
                              delimiter='\n') as reader:
            # content = reader.read()
            # print(content)
            for item in reader:
                lines.append(item.strip())
        return lines

    def file_create_write(self, file_path, data_write):
        '''
        Create a new file in HDFS and write content to it
        :param file_path: remote HDFS file path
        :param data_write: data to write to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_write,
                          encoding='utf-8')

    def file_append_write(self, file_path, data_append):
        '''
        Append content to a file already present in HDFS; the file must exist
        :param file_path: remote HDFS file path
        :param data_append: data to append to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_append,
                          encoding='utf-8',
                          append=True)

    def file_rename(self, src_file_path, dst_file_path):
        '''
        Rename or move a file or folder
        :param src_file_path: source file path
        :param dst_file_path: destination file path
        :return:
        '''
        self.client.rename(hdfs_src_path=src_file_path,
                           hdfs_dst_path=dst_file_path)

    def mkdir(self, file_path):
        '''
        Create a remote directory in HDFS, recursively if necessary
        :param file_path: path (including name) of the folder to create
        :return:
        '''
        self.client.makedirs(hdfs_path=file_path)

    def upload_files(self, file_path, local_path):
        '''
        Upload a file or directory to HDFS
        :param file_path: target HDFS path. If it already exists and is a directory, files are uploaded into it.
        :param local_path: local path of the file or folder. If a folder, all files inside it are uploaded (note that this means a folder containing no files will not be created remotely)
        :return: hdfs_path_return: on success, this method returns the remote upload path.
        '''
        hdfs_path_return = self.client.upload(hdfs_path=file_path,
                                              local_path=local_path)
        return hdfs_path_return

    def download_files(self, file_path, local_path):
        '''
        Download a file or folder from HDFS and save it locally
        :param file_path: path of the file or folder to download from HDFS. If a folder, all files under it are downloaded
        :param local_path: local path. If it already exists and is a directory, files are downloaded into it.
        :return: local_path_return: on success, this method returns the local download path
        '''
        local_path_return = self.client.download(hdfs_path=file_path,
                                                 local_path=local_path)
        return local_path_return

    def delete_files(self, file_path):
        '''
        Delete a file or directory from HDFS
        :param file_path: path of the file or directory to delete in HDFS
        :return: True if the deletion succeeded, False if no file or directory previously existed at hdfs_path.
        '''
        # recursive: recursively delete files and directories. By default this method raises an HdfsError when asked to delete a non-empty directory.
        # skip_trash: when set to False, deleted paths are moved to an appropriate trash folder instead of being removed. Requires Hadoop 2.9+ with trash enabled on the cluster.
        return self.client.delete(hdfs_path=file_path,
                                  recursive=False,
                                  skip_trash=True)

    def set_files_permission(self, file_path, permission=None):
        '''
        Change a file's permissions
        :param file_path: path of the file whose permissions should change
        :param permission: the file's new octal permission string
        :return:
        '''
        self.client.set_permission(hdfs_path=file_path, permission=permission)
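A short usage sketch of the wrapper class above, assuming a reachable WebHDFS endpoint; the URL and paths are placeholders:

# Placeholder endpoint and paths, for illustration only.
hdfs_ops = OperateHDFS('http://namenode.example.com:50070')
print(hdfs_ops.file_list('/tmp'))
hdfs_ops.mkdir('/tmp/demo')
hdfs_ops.file_create_write('/tmp/demo/hello.txt', 'hello\n')
hdfs_ops.file_append_write('/tmp/demo/hello.txt', 'world\n')
print(hdfs_ops.file_read('/tmp/demo/hello.txt'))
hdfs_ops.delete_files('/tmp/demo/hello.txt')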
Example No. 13
import os
from requests_kerberos import HTTPKerberosAuth
from hdfs.ext.kerberos import KerberosClient

url = "http://alderamin.sdab.sn:50070;http://fomalgaut.sdab.sn:50070"
os.environ[
    "KRB5_CLIENT_KTNAME"] = "/home/doopy/pyprojects/csp-ba-bas_logs_mapreduce/doopy.keytab"
kerberos_auth = HTTPKerberosAuth(principal="*****@*****.**")
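# Note: kerberos_auth is never attached to a session here; KerberosClient below
# builds its own HTTPKerberosAuth internally.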

client = KerberosClient(url)
print(client.list('/tmp/'))
Example No. 14
"""
*******************
*Copyright 2017, MapleLabs, All Rights Reserved.
*
********************
"""

import sys
from hdfs.ext.kerberos import KerberosClient
from hdfs.client import InsecureClient
from requests import Session
from requests_kerberos import HTTPKerberosAuth, DISABLED

session = Session()
session.verify = False
kerberos_auth = HTTPKerberosAuth(mutual_authentication=DISABLED, force_preemptive=True, principal='')
session.auth = kerberos_auth
client = KerberosClient("", session=session)
#client = InsecureClient("", session=session)
file = sys.argv[1]
destfile = sys.argv[2]

print(client.list('/mr-history/done'))

client.download(file, destfile, overwrite=True)

Example No. 15
from hdfs.ext.kerberos import KerberosClient

client = KerberosClient('http://X:50070')
# Listing all files inside a directory.
fnames = client.list('.')
print(fnames)