Example #1
def Run(jobTime):
    Client = HdfsClient(hosts=HDFS)
    Log = LogTime(jobTime)
    if Client.exists(Log.HPath + Log.success):
        r.rset(Rkey,jobTime,1)
        logger.info("-------- %s -----------" % jobTime)
        for dm in Domains:
            Files = Log.HPath + dm + Log.LogName
            if Client.exists(Files):
                Dfile = DOWN_DIR + '/' + dm + Log.UpFile
                TmpFile = dm + Log.UpTmp()
                try:
                    sts = time.time()
                    logger.info('DownloadStart... %s' % Files)
                    Client.copy_to_local(Files,Dfile)
                    logger.info('DownloadSuccess... %s %s' % (Files,Dfile))
                    Upload(TmpFile,Dfile,Log.PathDay)
                    Rename(Dday + "/" + TmpFile)
                    logger.info('UploadSuccess... %s' % Dfile)
                    ets = time.time()
                    Write(jobTime,dm,sts,ets,200)
                    r.rdel(Rkey,jobTime)
                except Exception as e:
                    Write(jobTime,dm,500)
                    r.rdel(Rkey,jobTime)
                    logger.error(e)
            else:
                logger.warn(Files + ' Non-existent')
                continue
        r.rdel(Rkey,jobTime)
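The snippet above leans on project-specific helpers (LogTime, Upload, Rename, Write, the Redis wrapper r) that are defined elsewhere. A minimal, self-contained sketch of the core pattern it follows, with made-up namenode and path names:

from pyhdfs import HdfsClient

def download_if_complete(hdfs_dir, local_path, namenode='namenode:50070'):
    """Download a result file only once the job's _SUCCESS marker exists (sketch only)."""
    client = HdfsClient(hosts=namenode)
    if not client.exists(hdfs_dir + '/_SUCCESS'):
        return False
    # '/part-00000' is a hypothetical output file name
    client.copy_to_local(hdfs_dir + '/part-00000', local_path)
    return True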
Example #2
def ProcAll(LocalDir, HdfsDir):
    NameNode = GolobalConfig['hdfs']['NameNode']
    UserName = GolobalConfig['hdfs']['UserName']
    client = HdfsClient(hosts=NameNode, user_name=UserName)
    if not client.exists(HdfsDir):
        client.mkdirs(HdfsDir)
    total = len(os.listdir(LocalDir))
    processed = 0
    failedList = list()
    FileSize = 0
    StartTime = time.time()
    for filename in os.listdir(LocalDir):
        srcFile = os.path.join(LocalDir, filename)
        dstFile = HdfsDir + '/' + filename
        if not ProcOne(client, srcFile, dstFile):
            failedList.append(srcFile)
        else:
            FileSize += os.path.getsize(srcFile)
        processed += 1
        print('%d/%d/%d, time cost: %.2f s' %
              (total, processed, len(failedList), time.time() - StartTime))
        print('%d B, %.2f MB/s \n' % (FileSize, FileSize / 1024 / 1024 /
                                      (time.time() - StartTime)))

    if failedList:
        print('failedList: %s' % repr(failedList))
        return False
    else:
        print('Good! No Error!')
        print('%d B, %.2f MB, %.2f GB, %.2f MB/s' %
              (FileSize, FileSize / 1024 / 1024, FileSize / 1024 / 1024 / 1024,
               FileSize / 1024 / 1024 / (time.time() - StartTime)))
        return True
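ProcOne is defined elsewhere in the original project; a plausible minimal version (an assumption, not the original implementation) that uploads one file and reports success:

def ProcOne(client, srcFile, dstFile):
    # Sketch: copy a single local file to HDFS, returning True on success.
    try:
        if client.exists(dstFile):
            client.delete(dstFile)
        client.copy_from_local(srcFile, dstFile)
        return True
    except Exception as e:
        print('upload failed for %s: %s' % (srcFile, e))
        return False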
Example #3
class HDFSSErvice:

    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunck_size = 100000

    def __init__(self):
        self._client = HdfsClient(hosts=self.namenode_host + ":" + self.namenode_port, user_name="root")

    def get(self, hdfs_path: str):
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunck_size):
            file_response = self._client.open(hdfs_path, offset=i, length=self.chunck_size)
            yield file_response.read()
        
    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)
    
    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        return int(self.get_file_size(hdfs_path) / self.chunck_size + 1)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
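A brief usage sketch for the class above; the path and payload are invented, and a WebHDFS endpoint on localhost:9870 is assumed:

service = HDFSSErvice()
# append() creates the file on first use via create_if_not_exist()
service.append("/topics/demo.log", b"hello world\n")

# Stream the file back chunck_size bytes at a time without loading it all at once
for chunk in service.get("/topics/demo.log"):
    print(len(chunk), "bytes")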
Example #4
def update_csv():
    local = '/Users/constantine/PycharmProjects/test02/data.csv'
    tmpLocal = '/Users/constantine/PycharmProjects/test02/tmpdata.csv'
    remote = '/data/data.csv'
    host = '127.0.0.1:9870'
    user_name = 'host'
    client = HdfsClient(hosts=host, user_name=user_name)
    if client.exists(remote):
        # Merge the local rows into the current remote file, de-duplicate,
        # then upload the result back to HDFS.
        client.copy_to_local(remote, tmpLocal)
        client.delete(remote)
        fRead = open(local, 'r')
        fWrite = open(tmpLocal, 'a')
        fWrite.writelines(fRead.readlines())
        fRead.close()
        fWrite.close()
        fRead = open(tmpLocal, 'r')
        lines = fRead.read()
        fRead.close()
        fWrite = open(tmpLocal, 'w')
        # Keep one copy of each non-empty row; set() drops duplicates.
        lines = '\n'.join(line for line in set(lines.split('\n')) if line)
        fWrite.write(lines)
        fWrite.close()
        client.copy_from_local(tmpLocal, remote)
    else:
        client.copy_from_local(local, remote)
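If the goal is simply to replace the remote file, the delete-then-upload sequence can usually be avoided. A sketch, assuming the installed pyhdfs forwards keyword arguments from copy_from_local to create (recent versions do):

from pyhdfs import HdfsClient

client = HdfsClient(hosts='127.0.0.1:9870', user_name='host')
# overwrite=True is passed through to the underlying WebHDFS CREATE call
client.copy_from_local('/Users/constantine/PycharmProjects/test02/data.csv',
                       '/data/data.csv', overwrite=True)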
Example #5
    def load_fields_with_vocab(self, hdfs_host: str) -> Dict[str, Field]:
        fs = HdfsClient(hdfs_host)
        if fs.exists(self.fields_path):
            print(f'get fields from {hdfs_host}{self.fields_path}')
        else:
            raise Exception(f'there are no fields in {hdfs_host}{self.fields_path}')

        loaded_dict = json.loads(fs.open(self.fields_path).read())
        return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}
Example #6
    def load_fields_from_c3(self) -> Tuple[Dict[str, Field], Dict[str, int]]:
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        if fs.exists(self.c3_fields_path):
            print(f'get fields from {self.C3_HDFS_HOST}{self.c3_fields_path}')
        else:
            raise Exception(f'there are no fields in {self.C3_HDFS_HOST}{self.c3_fields_path}')
        loaded_dict = json.loads(fs.open(self.c3_fields_path).read())
        print(loaded_dict)
        max_vocab_indexes = {k: v['max_vocab_index'] for k, v in loaded_dict.items()}
        return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}, max_vocab_indexes
Example #7
def Copy_From_Local(file):
    '''
    Upload a file to Hadoop
    '''
    h_file = ('/tmp/te/%s' % file)
    client = HdfsClient(hosts='localhost:50070')  # HDFS address; connect to HDFS
    if client.exists(h_file):
        # If the file already exists on HDFS, delete it first
        client.delete(h_file)
    client.copy_from_local(file, h_file)
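A small usage sketch; the file names are made up. Each call uploads the local file to /tmp/te/ on HDFS, replacing any earlier copy:

for name in ('report.csv', 'report.json'):
    Copy_From_Local(name)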
Example #8
def basic():
    client = HdfsClient(hosts='study:50070')
    print(client.list_status('/'))

    print('Check whether a path exists')
    print(client.exists("/test"))
    print(client.exists("/data/gz/thrift-0.9.2.tar.gz"))

    client = HdfsClient(hosts='study:50070')
    print(client.get_file_checksum("/data/gz/bison-2.5.1.tar.gz"))

    summary = client.get_content_summary("/")
    print(summary)

    # Copy a file from HDFS to the local filesystem
    client.copy_to_local("/data/gz/pip-7.1.2.tar.gz", "/root/data/pip-7.1.2.tar.gz")
    # Copy a file from the local filesystem to HDFS
    client.copy_from_local("/root/data/thrift-0.9.2.tar.gz", "/data/gz/thrift-0.9.2.tar.gz")

    print(client.get_home_directory())
Example #9
def save_img(path, corlor_pic):
    # Create the HDFS client
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    # The image can be read locally (or generated with the numpy module), e.g.:
    #     mat = cv2.imread(r"C:\Users\HUAWEI\Pictures\1.png")
    corlor_pic = cv2.resize(
        corlor_pic, (corlor_pic.shape[1] // 1, corlor_pic.shape[0] // 1))
    # Write the PNG-encoded image to the HDFS destination path
    if client.exists(path):
        client.delete(path)
    client.create(path, cv2.imencode('.png', corlor_pic)[1].tobytes())
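A usage sketch, assuming OpenCV is installed; the local and HDFS paths are made up:

import cv2

# Read a local image and store it on HDFS as /images/demo.png (hypothetical paths)
img = cv2.imread('demo.png')
if img is not None:
    save_img('/images/demo.png', img)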
Example #10
    def load_matrix(self, filepath, shape=None):
        if os.environ['local'] == '1' and os.path.exists(filepath):
            return np.loadtxt(filepath, dtype=float)
        else:
            hosts = os.environ['hosts']
            if len(hosts) == 0:
                hosts = 'master'
            client = HdfsClient(hosts=hosts)
            if client.exists(filepath):
                return np.frombuffer(
                    client.open(filepath).read()).reshape(shape)
        return False
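The binary layout has to match whatever wrote the file. A sketch of a compatible save step (made-up HDFS path, float64 data) so that np.frombuffer(...).reshape(shape) can recover the matrix:

import numpy as np
from pyhdfs import HdfsClient

# Store a 3x4 float64 matrix as raw bytes that load_matrix() can read back.
matrix = np.arange(12, dtype=float).reshape(3, 4)
client = HdfsClient(hosts='master')
client.create('/models/weights.bin', matrix.tobytes(), overwrite=True)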
Example #11
    def _load_preprocessed(self) -> List[Example]:
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        if fs.exists(self.c3_path):
            print(f'get preprocessed corpus from {self.C3_HDFS_HOST}{self.c3_path}')
        else:
            raise Exception(f'there is no preprocessed corpus in {self.C3_HDFS_HOST}{self.c3_path}')

        preprocessed = []
        for line in fs.open(self.c3_path).read().decode().split('\n'):
            if line:
                ex = Example()
                for k, v in json.loads(line).items():
                    setattr(ex, k, v)
                preprocessed.append(ex)
        return preprocessed
Example #12
def test_read_files_with_spaces(started_cluster):
    hdfs_api = started_cluster.hdfs_api

    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_spaces'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)

    hdfs_api.write_data(f"{dir}/test test test 1.txt", "1\n")
    hdfs_api.write_data(f"{dir}/test test test 2.txt", "2\n")
    hdfs_api.write_data(f"{dir}/test test test 3.txt", "3\n")

    node1.query(f"create table test (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{dir}/test*', 'TSV')")
    assert node1.query("select * from test order by id") == "1\n2\n3\n"
    fs.delete(dir, recursive=True)
Example #13
def test_hdfsCluster(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_hdfsCluster'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)
    hdfs_api.write_data("/test_hdfsCluster/file1", "1\n")
    hdfs_api.write_data("/test_hdfsCluster/file2", "2\n")
    hdfs_api.write_data("/test_hdfsCluster/file3", "3\n")

    actual = node1.query("select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    actual = node1.query("select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected
    fs.delete(dir, recursive=True)
Example #14
            x['pathSuffix'] for x in client.list_status(dir_dataroot)
            if x['type'] == 'DIRECTORY'
        ]
        list_dirs.sort()
        for subdir in list_dirs:
            dir_subdata = os.path.join(dir_dataroot, subdir)
            logger.debug('data path : %s' % dir_subdata)
            dir_subdata_cleaned = os.path.join(dir_subdata, 'cleaned4netsec')
            logger.debug('data path for cleaned files : %s' %
                         dir_subdata_cleaned)
            list_subdir_date = [
                x['pathSuffix'] for x in client.list_status(dir_subdata)
                if x['type'] == 'FILE'
            ]
            if len(list_subdir_date) > 0:
                if not client.exists(dir_subdata_cleaned):
                    client.mkdirs(dir_subdata_cleaned)
                    logger.debug('mkdir dir for cleaned files : %s' %
                                 dir_subdata_cleaned)

            list_subdir_date_cleaned = [
                x['pathSuffix']
                for x in client.list_status(dir_subdata_cleaned)
                if x['type'] == 'FILE'
            ]

            list_subdir_date.sort()
            for fname in list_subdir_date:
                if fname in list_subdir_date_cleaned:
                    #                     #TODO: to debug
                    #                     if client.exists(os.path.join(dir_subdata_cleaned, fname)):
Example #15
class hdfs(object):
    # Default port is 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost=self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)
        pass

    def concat(self, target, sources):
        self.hdfs.concat(target, sources)

    # self, taskJobId,tableName=None,jobTemplateFieldList=None
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None,
                               data=None):
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename == None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get(
            "task_job_id_sequence") if paramMap != None else None
        if task_job_id_sequenceValue != None:
            column_dict.update(
                {"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
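A construction sketch for the wrapper above; the connection values are placeholders and the DAO helpers (TaskJobDao) are assumed to be importable in the real project:

# Hypothetical configuration: WebHDFS host plus a base path used as the "database".
cur_database_param = {'url': 'namenode:50070', 'dbname': '/warehouse/crawler'}
store = hdfs(cur_database_param)

# save_to_hdfs2() appends when the file exists and creates it otherwise.
store.save_to_hdfs2('/warehouse/crawler/demo_table', b'col1,col2\n')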
Example #16
def yesterday():
    return today() - datetime.timedelta(days=1)


# Main entry point
if __name__ == '__main__':
    print("Monitoring HDFS......")
    yesterday_datetime_format = yesterday()
    for table in CHECK_TABLE:
        is_success = False
        has_data = False
        content = ""
        try:
            path = ROOT_DIR + table + "/" + str(yesterday_datetime_format)
            if client.exists(path):
                client_list = client.list_status(path)
                for file_status in client_list:
                    if (file_status.get("pathSuffix").startswith('part-')) and (int(file_status.get("length")) > 0):
                        has_data = True
                    elif file_status.get("pathSuffix") == "_SUCCESS":
                        is_success = True
            else:
                content = "Error: HDFS path does not exist <br>" + \
                          str("HDFS path:") + path
        except Exception as e:
            content = "Error: " + str(e) + "<br>" + \
                      str("HDFS path:") + path

        if (content == "") and (not is_success):
            content = "Error: " + table + " related job failed" + "<br>" + \