Example #1
    def get_model(self):
        client = KerberosClient(settings.DUNANT_HDFS_PATH)

        MODEL_DIR = settings.DUNANT_MODEL_DIR
        # Model directories sort lexicographically by date, so the last
        # entry is the most recent model
        MOST_RECENT_MODEL = sorted(client.list(MODEL_DIR))[-1]

        MODEL_PARAMETERS_PATH = f'{MODEL_DIR}/{MOST_RECENT_MODEL}/model'
        MLB_PATH = f'{MODEL_PARAMETERS_PATH}/mlb_binarizer.pkl'
        VECTORIZER_PATH = f'{MODEL_PARAMETERS_PATH}/vectorizer.pkl'
        CLASSIFIER_PATH = f'{MODEL_PARAMETERS_PATH}/model.pkl'

        # For pickle to unpickle these objects, their classes must be
        # importable under the same module path as when they were pickled,
        # so we manually alias sys.modules to mimic that import structure.
        sys.modules['models'] = classifiers

        # Latin-1 encoding is required to load Python 2 pickles under Python 3
        with client.read(MLB_PATH) as r:
            mlb = pickle.loads(r.read(), encoding="latin1")
        with client.read(VECTORIZER_PATH) as r:
            vectorizer = pickle.loads(r.read(), encoding="latin1")
        with client.read(CLASSIFIER_PATH) as r:
            clf = pickle.loads(r.read(), encoding="latin1")
        del sys.modules['models']

        return mlb, vectorizer, clf
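
The sys.modules aliasing used above is a common workaround when a pickle references classes by a module path that no longer exists. A minimal standalone sketch of the trick, with hypothetical module and file names:

import pickle
import sys

import my_project.classifiers as classifiers  # hypothetical current home of the classes

# The pickle was written when the classes lived in a module named
# 'models'; alias that name so pickle can resolve references such as
# 'models.SomeClassifier' during load, then remove the alias.
sys.modules['models'] = classifiers
try:
    with open('model.pkl', 'rb') as f:  # hypothetical local pickle
        clf = pickle.load(f)
finally:
    del sys.modules['models']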
Example #2
    client = KerberosClient(hdfs_url)

    s3 = session.client('s3', use_ssl=False, verify=False)
    counter = 0

    for file_path in file_list_arr:

        file_path = source_directory + file_path

        # With strict=False, status() returns None instead of raising
        # when the path does not exist
        status = client.status(file_path, strict=False)

        if status:
            file_name = os.path.basename(file_path)
            key_name = s3_folder_name + file_name

            with client.read(file_path) as f:
                s3.upload_fileobj(f, bucket_name, key_name)

            client.delete(file_path, recursive=False, skip_trash=True)
            counter += 1
            py_logger.info("File: %s moved to S3 bucket", file_path)

    py_logger.info("S3 script execution completed. Number of files moved: %s", counter)

    # Compress log files older than 30 days (this branch only runs on
    # the 30th day of the month)
    today = date.today()
    current_day = datetime.now().strftime('%d')
    # Directory portion of the log path, including its trailing slash
    log_directory = log_file_path.rpartition('/')[0] + log_file_path.rpartition('/')[1]
    tarFileName = log_directory + today.strftime("%d-%m-%Y") + '.tar.gz'

    if current_day == "30":
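
The example breaks off inside this branch; a possible continuation, sketched with the standard tarfile module (the *.log glob and the delete-after-archive step are assumptions, not part of the original):

import glob
import os
import tarfile

# Hypothetical continuation: bundle the directory's .log files into the
# dated archive, removing each file once it has been added.
with tarfile.open(tarFileName, 'w:gz') as tar:
    for log_file in glob.glob(log_directory + '*.log'):
        tar.add(log_file, arcname=os.path.basename(log_file))
        os.remove(log_file)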
Example #3
# Usage: python <script> <namenode_url> <hdfs_file_path>
import sys

from hdfs.ext.kerberos import KerberosClient

client = KerberosClient(sys.argv[1])

# Read the whole file into memory; read() returns raw bytes
with client.read(sys.argv[2]) as reader:
    test = reader.read()

print(test)
Example #4
# encoding: utf-8
"""

@Author: wanghuagang
@Contact: [email protected]
@Project: StudyPython
@File:  d1
@Date: 2019/7/15 10:27 AM
@Description:

"""

from hdfs.ext.kerberos import KerberosClient

client = KerberosClient('http://m1.node.hadoop:50070')

with client.read('/tmp/fastparquet/user1.parquet') as reader:
    features = reader.read()
    print(features)
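
reader.read() returns the raw Parquet bytes; to turn them into a DataFrame, one option is to wrap them in an in-memory buffer and hand them to pandas (a sketch, assuming pandas plus a Parquet engine such as pyarrow or fastparquet is installed):

import io

import pandas as pd
from hdfs.ext.kerberos import KerberosClient

client = KerberosClient('http://m1.node.hadoop:50070')

with client.read('/tmp/fastparquet/user1.parquet') as reader:
    raw = reader.read()

# Parse the raw bytes into a DataFrame; pandas picks whichever Parquet
# engine is available.
df = pd.read_parquet(io.BytesIO(raw))
print(df.head())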
Example #5
       .agg(lambda x: set(x))\
       .reset_index()

nb_new_documents = len(df)
if nb_new_documents == 0:
    print('No new data to predict!')
    sys.exit()
else:
    print('{} new documents to predict.\n'.format(nb_new_documents))

X = np.array(df[TEXT_COLUMN])

print('Loading models...')
# Drop the leading URI components (scheme, host, and so on) so the
# client is handed a plain HDFS path
formatted_hdfs_path = "/".join(HDFS_MODEL_DIR.split('/')[5:])
# Model directories are named by date, so the lexicographically last
# entry is the most recent one
most_recent_date = sorted(client.list(formatted_hdfs_path))[-1]
with client.read('{}/{}/model/mlb_binarizer.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as mlb_reader:
    mlb = pickle.loads(mlb_reader.read())
with client.read('{}/{}/model/vectorizer.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as vectorizer_reader:
    vectorizer = pickle.loads(vectorizer_reader.read())
with client.read('{}/{}/model/model.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as clf_reader:
    clf = pickle.loads(clf_reader.read())

print('Predicting...')
reg_clf = RegexClassifier(RULES)
y_regex = reg_clf.predict(X)
y_regex = mlb.transform(y_regex)

X = vectorizer.transform(X)
y_logreg = clf.predict(X, threshold=THRESHOLD, max_classes=MAX_CLASSES)
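
Both prediction paths yield binarized label matrices in the binarizer's label space, so human-readable label sets can be recovered with inverse_transform; a small follow-on sketch (assuming y_regex and y_logreg are valid indicator matrices for mlb):

# Recover label name tuples, one per document, from the indicator matrices
regex_labels = mlb.inverse_transform(y_regex)
logreg_labels = mlb.inverse_transform(y_logreg)

for doc_regex, doc_logreg in zip(regex_labels, logreg_labels):
    print('regex:', doc_regex, '| logreg:', doc_logreg)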
Example #6
from hdfs.ext.kerberos import KerberosClient


class OperateHDFS:
    def __init__(self, url):
        '''

        :param url: hostname or IP address of the HDFS namenode, prefixed
            with the protocol and followed by the namenode's WebHDFS port;
            multiple URLs separated by semicolons may also be given for
            High Availability support.
        '''
        # Instantiate the HDFS web client using Kerberos authentication
        self.client = KerberosClient(url)

    def file_list(self, file_path):
        '''

        :param file_path: remote HDFS directory path
        :return: all files contained in the remote directory
        '''
        file_detail = self.client.list(hdfs_path=file_path)
        return file_detail

    def file_read(self, file_path):
        '''
        Read a file from HDFS
        :param file_path: remote HDFS file path
        :return: list of stripped lines from the file
        '''
        lines = []
        # delimiter must be an actual newline character, not the raw
        # string r'\n', for the reader to yield one chunk per line
        with self.client.read(hdfs_path=file_path,
                              encoding='utf-8',
                              delimiter='\n') as reader:
            for item in reader:
                lines.append(item.strip())
        return lines

    def file_create_write(self, file_path, data_write):
        '''
        Create a new file in HDFS and write content to it
        :param file_path: remote HDFS file path
        :param data_write: data to write to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_write,
                          encoding='utf-8')

    def file_append_write(self, file_path, data_append):
        '''
        Append content to a file in HDFS; the file must already exist
        :param file_path: remote HDFS file path
        :param data_append: data to append to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_append,
                          encoding='utf-8',
                          append=True)

    def file_rename(self, src_file_path, dst_file_path):
        '''
        Rename/move a file or folder
        :param src_file_path: source file path
        :param dst_file_path: destination file path
        :return:
        '''
        self.client.rename(hdfs_src_path=src_file_path,
                           hdfs_dst_path=dst_file_path)

    def mkdir(self, file_path):
        '''
        Create a remote directory in HDFS, recursively if necessary
        :param file_path: path of the folder to create (including its name)
        :return:
        '''
        self.client.makedirs(hdfs_path=file_path)

    def upload_files(self, file_path, local_path):
        '''
        Upload a file or directory to HDFS
        :param file_path: target HDFS path. If it already exists and is a
            directory, files will be uploaded inside it.
        :param local_path: local path to a file or folder. If a folder, all
            the files inside it will be uploaded (note that this means a
            folder without files will not be created remotely)
        :return: hdfs_path_return: on success, this method returns the
            remote upload path.
        '''
        hdfs_path_return = self.client.upload(hdfs_path=file_path,
                                              local_path=local_path)
        return hdfs_path_return

    def download_files(self, file_path, local_path):
        '''
        Download a file or folder from HDFS and save it locally
        :param file_path: path of the file or folder to download from HDFS.
            If a folder, all the files under it will be downloaded
        :param local_path: local path. If it already exists and is a
            directory, the files will be downloaded inside it.
        :return: local_path_return: on success, this method returns the
            local download path
        '''
        local_path_return = self.client.download(hdfs_path=file_path,
                                                 local_path=local_path)
        return local_path_return

    def delete_files(self, file_path):
        '''
        Delete a file or directory from HDFS
        :param file_path: path of the file or directory to delete from HDFS
        :return: this function returns `True` if the deletion succeeded and
            `False` if no file or directory previously existed at `hdfs_path`.
        '''
        # recursive: recursively delete files and directories. By default,
        # this method raises an HdfsError when trying to delete a
        # non-empty directory.
        # skip_trash: when set to False, the deleted path is moved to the
        # appropriate trash folder instead of being removed. This requires
        # Hadoop 2.9+ and trash to be enabled on the cluster.
        return self.client.delete(hdfs_path=file_path,
                                  recursive=False,
                                  skip_trash=True)

    def set_files_permission(self, file_path, permission):
        '''
        Change a file's permissions
        :param file_path: path of the file whose permissions to change
        :param permission: new octal permission string for the file
            (e.g. '755')
        :return:
        '''
        self.client.set_permission(hdfs_path=file_path,
                                   permission=permission)
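
A minimal usage sketch of the class above, assuming a Kerberized cluster with a valid ticket; the namenode URL and every path below are placeholders:

hdfs = OperateHDFS('http://namenode.example.com:50070')

hdfs.mkdir('/tmp/demo')
hdfs.file_create_write('/tmp/demo/hello.txt', 'hello')
hdfs.file_append_write('/tmp/demo/hello.txt', '\nworld')

print(hdfs.file_list('/tmp/demo'))
print(hdfs.file_read('/tmp/demo/hello.txt'))

hdfs.delete_files('/tmp/demo/hello.txt')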